These Python 2 examples exercise the annotation module of a student-response summarization codebase. Each excerpt relies on its source file's module-level imports (e.g. os, json, codecs, numpy/np, defaultdict from collections, fio, NLTKWrapper) and module globals such as method, sim_exe, course, and debug.

Example 1
def extractVocab(annotators, output):
    vocab = defaultdict(int)
    for docs in annotation.generate_all_files_by_annotators(
            annotation.datadir + 'json/',
            '.json',
            anotators=annotators,
            lectures=annotation.Lectures):

        doc0, lec0, annotator0 = docs[0]
        doc1, lec1, annotator1 = docs[1]

        assert (lec0 == lec1)
        lec = lec0

        #load tasks
        task0 = annotation.Task()
        task0.loadjson(doc0)

        task1 = annotation.Task()
        task1.loadjson(doc1)

        for prompt in ['q1', 'q2']:
            aligner = AlignPhraseAnnotation(task0, task1, prompt)
            aligner.align()

            #count lowercased tokens across the aligned responses
            for d in aligner.responses:
                tokens = [token.lower() for token in d['response']]

                for token in tokens:
                    vocab[token] += 1
    fio.SaveDict2Json(vocab, output)
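
A minimal driver sketch for extractVocab; the annotator slice and output path are illustrative, annotation.anotators is assumed to hold the configured annotator ids, and the misspelled anotators keyword matches the annotation module's own spelling:

#hypothetical driver: count token frequencies over the first two annotators
extractVocab(annotation.anotators[:2], '../data/vocab.json')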
Example 2
def extractPhrasePaireFromAnnotation(phrasedir, annotators, id):
    for doc, lec, annotator in annotation.generate_all_files(
            annotation.datadir + 'json/',
            '.json',
            anotators=annotators,
            lectures=annotation.Lectures):
        print doc

        #load task
        task = annotation.Task()
        task.loadjson(doc)

        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            #'method' and 'sim_exe' are module-level globals in the source file
            prefix = os.path.join(path, '%s.%s.' % (prompt, method))
            filename = path + prompt + sim_exe
            print filename

            featureset = []

            feature_extractor = Similarity(prefix)

            phrase_annotation = task.get_phrase_annotation(prompt)

            #pair phrases across ranks: same rank scores 1.0, different ranks 0.0
            for rank1 in sorted(phrase_annotation):
                for rank2 in sorted(phrase_annotation):
                    if rank1 == rank2:
                        score = 1.0
                    else:
                        score = 0.0

                    phrases1 = phrase_annotation[rank1]
                    phrases2 = phrase_annotation[rank2]
                    for phrasedict1 in phrases1:
                        p1 = phrasedict1['phrase'].lower().strip()

                        for phrasedict2 in phrases2:
                            p2 = phrasedict2['phrase'].lower().strip()

                            featureset.append(
                                (feature_extractor.get_features(p1,
                                                                p2), score, {
                                                                    'p1': p1,
                                                                    'p2': p2
                                                                }))

            fio.SaveDict2Json(featureset, filename)

            feature_extractor.save()
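
Each entry written above is a (features, score, metadata) triple: same-rank phrase pairs are labeled 1.0 and cross-rank pairs 0.0. A hedged sketch of reading such a file back for training, assuming fio.SaveDict2Json is a thin json.dump wrapper (JSON turns the triples into 3-element lists):

import json

#featureset_file points at one of the files written above (hypothetical name)
with open(featureset_file) as fin:
    featureset = json.load(fin)

X = [features for features, score, meta in featureset]  #feature dicts
y = [score for features, score, meta in featureset]     #1.0 / 0.0 labels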
Example 3
def extractPhraseFromAnnotationIntersect(phrasedir, annotators):
    for docs in annotation.generate_all_files_by_annotators(
            annotation.datadir + 'json/',
            '.json',
            anotators=annotators,
            lectures=annotation.Lectures):

        doc0, lec0, annotator0 = docs[0]
        doc1, lec1, annotator1 = docs[1]

        assert (lec0 == lec1)
        lec = lec0

        #load tasks
        task0 = annotation.Task()
        task0.loadjson(doc0)

        task1 = annotation.Task()
        task1.loadjson(doc1)

        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'
            print filename

            aligner = AlignPhraseAnnotation(task0, task1, prompt)
            aligner.align()

            #keep only phrases both annotators highlighted
            extracted_phrases = aligner.get_intersect()

            fio.SaveList(extracted_phrases, filename)
Example 4
def get_phrase_reference_summary_phrase(outputs = None):
    
    for output in outputs:
        fio.NewPath(output)
        
        counts = []
        for doc, lec, annotator in annotation.generate_all_files(
                annotation.datadir + 'json/',
                '.json',
                anotators=annotation.anotators,
                lectures=annotation.Lectures):
            print doc
            
            task = annotation.Task()
            task.loadjson(doc)
            
            sub_tasks = task.get_tasks()
            
            for sub_task in sub_tasks:
                if sub_task["task_name"] == "Phrase":
                    if sub_task['prompt'] == 0: #POI
                        type = 'q1'
                    else: 
                        type = 'q2'
                    
                    summary_filename = os.path.join(output, str(lec), type+'.ref.' + str(annotation.anotator_dict[annotator])) 
                    #summary_filename = os.path.join(output, str(lec), type+'.ref.summary') 
                    
                    print summary_filename
                    
                    #skip the header row; row[1] is the summary text, row[0]
                    #holds the color code, and row[2] the supporting student number
                    summaries = [row[1] for row in sub_task["summary"][1:]]
                    colors = [row[0].strip()[1] for row in sub_task["summary"][1:]]
                    student_numbers = [row[2].strip() for row in sub_task["summary"][1:]]
                    
                    count = 0
                    for summary in summaries:
                        count += len(NLTKWrapper.wordtokenizer(summary))
                    
                    counts.append(count)
                    fio.SaveList(summaries, summary_filename)
                    
                    color_filename = os.path.join(output, str(lec), '%s.ref.%s.color'%(type, str(annotation.anotator_dict[annotator])))
                    fio.SaveList(colors, color_filename)
                    
                    no_filename = os.path.join(output, str(lec), '%s.ref.%s.no'%(type, str(annotation.anotator_dict[annotator])))
                    fio.SaveList(student_numbers, no_filename)
        
        print counts
        print numpy.mean(counts)
        print numpy.median(counts)
Example 5
def compare_length(annotator, output):
    #words per student response, keyed by prompt ('q1'/'q2')
    lengths = defaultdict(list)
    for doc, lec, annotator in annotation.generate_all_files(
            annotation.datadir + 'json/',
            '.json',
            anotators=annotator,
            lectures=annotation.Lectures):
        print doc

        #load task
        task = annotation.Task()
        task.loadjson(doc)

        for prompt in ['q1', 'q2']:
            #group raw responses by student (skip the header row)
            responses = {}
            raw_responses = task.get_raw_response(prompt)
            for response_row in raw_responses[1:]:
                student_id = response_row['student_id'].lower()
                responses.setdefault(student_id, []).append(
                    response_row['response'])

            #record each student's total word count for this prompt
            for stu in responses:
                lengths[prompt].append(len(' '.join(responses[stu]).split()))

    import stats_util

    #'course' is a module-level global; ttest is assumed to return a tuple
    #whose last element is the p-value
    print '%s\t%f\t%f\t%f' % (
        course, np.mean(lengths['q1']), np.mean(lengths['q2']),
        stats_util.ttest(lengths['q1'], lengths['q2'], 2, 2)[-1])
Example 6
def get_phrase_reference_summary_phrase_no(outputs = None):
    
    Numbers = []

    #only the first annotator's files are needed
    for doc, lec, annotator in annotation.generate_all_files(
            annotation.datadir + 'json/',
            '.json',
            anotators=annotation.anotators[:1],
            lectures=annotation.Lectures):
        print doc
        
        task = annotation.Task()
        task.loadjson(doc)
        
        sub_tasks = task.get_tasks()
        
        for sub_task in sub_tasks:
            if sub_task["task_name"] == "Phrase":
                if sub_task['prompt'] == 0: #POI
                    type = 'q1'
                else: 
                    type = 'q2'
                
                #the third summary column holds the number of supporting students
                student_numbers = [row[2].strip() for row in sub_task["summary"][1:]]
                Numbers += [int(x) for x in student_numbers]
                    
    fio.SaveDict2Json(Numbers, '../data/%s_supporters.txt'%global_params.g_cid)
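
Note that Numbers is a list rather than a dict, so the SaveDict2Json name is misleading here; the call only works if fio.SaveDict2Json is a thin json.dump wrapper, which this standard-library sketch assumes:

import json

with open('../data/%s_supporters.txt' % global_params.g_cid, 'w') as fout:
    json.dump(Numbers, fout, indent=2)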
Example 7
def extractPhraseFromAnnotation(phrasedir, annotator, summarydir=None):
    for doc, lec, annotator in annotation.generate_all_files(
            annotation.datadir + 'json/',
            '.json',
            anotators=annotator,
            lectures=annotation.Lectures):
        print doc

        #load task
        task = annotation.Task()
        task.loadjson(doc)

        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        #Add a cache to make it faster
        Cache = {}
        cachefile = phrasedir + str(lec) + '/' + 'cache.json'
        if fio.IsExist(cachefile):
            with open(cachefile, 'r') as fin:
                Cache = json.load(fin)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'
            cluster_output = path + prompt + '.cluster.kmedoids.sqrt.oracle.%s' % method

            if summarydir:
                fio.NewPath(os.path.join(summarydir, str(lec)))
                summary_file = os.path.join(summarydir, str(lec),
                                            '%s.summary' % prompt)

            body = []

            if summarydir:
                summaries = []

            phrase_summary_dict = task.get_phrase_summary_textdict(prompt)
            extracted_phrases = []
            phrase_annotation = task.get_phrase_annotation(prompt)
            for rank in sorted(phrase_annotation):
                rank_phrases = []
                phrases = phrase_annotation[rank]
                for phrasedict in phrases:
                    phrase = phrasedict['phrase'].lower()
                    extracted_phrases.append(phrase)
                    rank_phrases.append(phrase)
                    row = [phrase, rank]
                    body.append(row)

                if summarydir:
                    #pick the highlighted phrase closest (by ROUGE) to the
                    #annotator's summary for this rank; Cache is reused across
                    #calls to avoid recomputing scores
                    rank_summary = phrase_summary_dict[rank]
                    max_summary = get_max_phrase_by_ROUGE(
                        rank_summary, rank_phrases, Cache)
                    print max_summary

                    summaries.append(max_summary)

            fio.SaveList(extracted_phrases, filename)

            fio.WriteMatrix(cluster_output, body, header=None)

            if summarydir:
                fio.SaveList(summaries, summary_file)

            with open(cachefile, 'w') as outfile:
                json.dump(Cache, outfile, indent=2)
Example 8
def extractStatistics(annotator, output):

    students = set()

    body = []
    for doc, lec, annotator in annotation.generate_all_files(
            annotation.datadir + 'json/',
            '.json',
            anotators=annotator,
            lectures=annotation.Lectures):
        print doc

        #load task
        task = annotation.Task()
        task.loadjson(doc)

        for prompt in ['q1', 'q2']:
            #for each lecture, prompt
            row = [lec, prompt]

            stu = set()

            #group raw responses by student (skip the header row) and
            #accumulate the total word count
            wc = 0.0
            responses = {}
            raw_responses = task.get_raw_response(prompt)
            for response_row in raw_responses[1:]:
                student_id = response_row['student_id'].lower()
                response = response_row['response']

                responses.setdefault(student_id, []).append(response)
                students.add(student_id)
                stu.add(student_id)

                wc += len(response.split())

            response_number = len(responses)
            row.append(response_number)  #number of responding students
            row.append(wc)  #word count
            row.append(wc / response_number)  #average words per response

            phrase_summary_dict = task.get_phrase_summary_textdict(prompt)
            extracted_phrases = []
            phrase_annotation = task.get_phrase_annotation(prompt)

            stu_h = set()
            ph_c = 0
            for rank in sorted(phrase_annotation):
                phrases = phrase_annotation[rank]
                ph_c += len(phrases)
                for phrasedict in phrases:
                    phrase = phrasedict['phrase'].lower()  #phrase
                    extracted_phrases.append(phrase)

                    student_id = phrasedict['student_id'].lower().strip()
                    stu_h.add(student_id)

            row.append(ph_c)  #phrase count
            coverage = stu.intersection(stu_h)
            coverage_ratio = len(coverage) * 1.0 / len(stu)
            row.append(coverage_ratio)

            body.append(row)

    #append average and std rows, computed over the data rows only (the
    #original appended the average row first and then included it in the std)
    head = [
        'lec', 'prompt', 'Response', 'Word', 'Word/Response', 'Highlights',
        'Coverage'
    ]

    data_rows = list(body)

    row = ['', 'ave']
    for i in range(2, len(head)):
        scores = [float(xx[i]) for xx in data_rows]
        row.append(np.mean(scores))
    body.append(row)

    row = ['', 'std']
    for i in range(2, len(head)):
        scores = [float(xx[i]) for xx in data_rows]
        row.append(np.std(scores))
    body.append(row)

    fio.WriteMatrix(output, body, head)

    print(len(students))
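
The Coverage column is the fraction of responding students who had at least one phrase highlighted by the annotator. A tiny worked example with illustrative ids:

stu = set(['s1', 's2', 's3', 's4'])  #students who responded
stu_h = set(['s2', 's4'])            #students with a highlighted phrase
coverage_ratio = len(stu & stu_h) * 1.0 / len(stu)  #2 / 4 = 0.5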
Example 9
def extractPhraseFromSyntax(extractiondir, annotators):
    for docs in annotation.generate_all_files_by_annotators(
            annotation.datadir + 'json/',
            '.json',
            anotators=annotators,
            lectures=annotation.Lectures):

        doc0, lec0, annotator0 = docs[0]
        doc1, lec1, annotator1 = docs[1]

        assert (lec0 == lec1)
        lec = lec0

        #         if lec != 11: continue

        #load tasks
        task0 = annotation.Task()
        task0.loadjson(doc0)

        task1 = annotation.Task()
        task1.loadjson(doc1)

        path = extractiondir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.feature.crf'
            print filename

            fout = codecs.open(filename, "w", "utf-8")

            extracted_phrases = []
            phrase_annotation0 = task0.get_phrase_annotation(prompt)
            phrase_annotation1 = task1.get_phrase_annotation(prompt)

            aligner = AlignPhraseAnnotation(task0, task1, prompt)
            aligner.align()

            for d in aligner.responses:
                tokens = [token.lower() for token in d['response']]
                tags = d['tags'][0]  #use the first annotator's tag sequence

                colors = d['colors']

                n_tokens = []
                n_tags = []

                for token, tag in zip(tokens, tags):
                    if len(token) == 0: continue

                    n_tokens.append(token)
                    n_tags.append(tag)

                if len(n_tokens) == 0: continue

                tokens = n_tokens
                tags = n_tags

                body = []

                #one row per token; the first column is the word itself
                for word in tokens:
                    body.append([word])

                #one color column per annotator
                for color in colors:
                    for i in range(len(tags)):
                        body[i].append(str(color[i]))

                #gold tag column
                for i, tag in enumerate(tags):
                    body[i].append(tag)

                #Senna PSG (syntax) tag column
                psg_tags = getSennaPSGtags(tokens)
                for i, tag in enumerate(psg_tags):
                    body[i].append(tag)

                for row in body:
                    fout.write(' '.join(row))
                    fout.write('\n')
                fout.write('\n')

            fout.close()

        if debug:
            break
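
Each non-blank line written above holds one token followed by one color column per annotator, the gold tag, and the Senna PSG tag, with a blank line separating responses. A hedged reader sketch under that layout (read_crf_file is a hypothetical helper, not part of the source repo):

import codecs

def read_crf_file(filename):
    #split a .feature.crf file back into per-response token tables
    sentences, current = [], []
    for line in codecs.open(filename, 'r', 'utf-8'):
        line = line.rstrip('\n')
        if line:
            current.append(line.split(' '))
        elif current:
            sentences.append(current)
            current = []
    if current:
        sentences.append(current)
    return sentences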
Example 10
def extractPhraseFeatureFromCombine(extractiondir, annotators, empty='N'):
    for docs in annotation.generate_all_files_by_annotators(
            annotation.datadir + 'json/',
            '.json',
            anotators=annotators,
            lectures=annotation.Lectures):

        doc0, lec0, annotator0 = docs[0]
        doc1, lec1, annotator1 = docs[1]

        assert (lec0 == lec1)
        lec = lec0

        #print lec

        #if lec != 17: continue

        #load tasks
        task0 = annotation.Task()
        task0.loadjson(doc0)

        task1 = annotation.Task()
        task1.loadjson(doc1)

        path = extractiondir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:

            #if prompt != 'q2': continue

            filename = path + prompt + '.feature.crf'
            print filename

            fout = codecs.open(filename, "w", "utf-8")

            aligner = AlignPhraseAnnotation(task0, task1, prompt)
            aligner.align()

            crf_feature_extractor = CRF_Extractor()

            #add sentences to the extractor for global feature extraction
            for d in aligner.responses:
                tokens = [token.lower() for token in d['response']]
                colors = d['colors']

                #keep one copy if the two annotators' tag sequences agree,
                #otherwise one per annotator
                if d['tags'][0] == d['tags'][1]:
                    combinetags = [d['tags'][0]]
                else:
                    combinetags = [d['tags'][0], d['tags'][1]]

                for tags in combinetags:
                    #drop empty tokens without rebinding the outer 'tokens',
                    #so the second tag sequence still aligns with the raw tokens
                    n_tokens = []
                    n_tags = []

                    for token, tag in zip(tokens, tags):
                        if len(token) == 0: continue

                        n_tokens.append(token)
                        n_tags.append(tag)

                    if len(n_tokens) == 0: continue

                    crf_feature_extractor.add_sentence((n_tokens, n_tags, colors))

            for tokens, tags, colors in crf_feature_extractor.sentences:
                #with empty='Y', skip responses with no highlighted span
                #(all tags are 'O')
                if empty == 'Y' and all(tag == 'O' for tag in tags):
                    continue

                body = crf_feature_extractor.extract_crf_features(
                    tokens, tags, prompt, colors)

                for row in body:
                    fout.write(' '.join(row))
                    fout.write('\n')
                fout.write('\n')

            fout.close()