def get_docs_stats():
    logging.root.setLevel(logging.ERROR)
    acumm_stats_before = [0 for _ in voz.Document.get_stats_labels()]
    acumm_stats_after = [0 for _ in voz.Document.get_stats_labels()]
    acumm_count = 0
    logging.basicConfig(level=logging.WARNING)
    file_path = settings.STY_FILE_PATH
    for sty_file in settings.STY_FILES:
        acumm_count += 1
        logger.info("Processing %s" % sty_file)
        doc = styhelper.create_document_from_sty_file(file_path + sty_file)
        doc_stats = doc.get_stats()
        for i in xrange(len(acumm_stats_before)):
            acumm_stats_before[i] += doc_stats[i]
        quoted_speech_file = sty_file.split()[0] + "/sentences.tsv.csv"
        quotedspeechhelper.annotate_sentences(doc, file_path + quoted_speech_file)
        quotedspeechhelper.clean_quoted_speech_from_document(doc)
        doc_stats = doc.get_stats()
        for i in xrange(len(acumm_stats_before)):
            acumm_stats_after[i] += doc_stats[i]
        #break
    print "Counts"
    #print voz.Document.format_stats(acumm_stats_before)
    print voz.Document.format_stats(acumm_stats_after)
    print "Averages"
    for i in xrange(len(acumm_stats_before)):
        #acumm_stats_before[i] = 1.0*acumm_stats_before[i]/acumm_count
        acumm_stats_after[i] = 1.0 * acumm_stats_after[i] / acumm_count
    #print voz.Document.format_stats(acumm_stats_before)
    print voz.Document.format_stats(acumm_stats_after)

def generate_filtered_text_files():
    """ Generate files to be processed by parsers.
    ClearNLP:
        source /Users/josepvalls/soft/clearnlp/setup_classpath.sh
        java -Xmx5g -XX:+UseConcMarkSweepGC edu.emory.clir.clearnlp.bin.NLPDecode -mode ner -c config_decode_ner.xml -i /Users/josepvalls/voz2/stories/dialog_filtered -ie txt
    ClearNLP Coref:
        java /Users/josepvalls/Dropbox/projects/clearnlp/src/main/java/edu/drexel/valls
    Stanford CoreNLP:
        use CoreNLP server cache
    Open NLP:
        sh /Users/josepvalls/Dropbox/projects/coref-opennlp/Coref/run.cmd
    """
    logging.basicConfig(level=logging.DEBUG)
    file_path = settings.STY_FILE_PATH
    for sty_file in settings.STY_FILES:
        logger.info("Processing %s" % sty_file)
        quoted_speech_file = sty_file.split()[0] + "/sentences.csv"
        doc = styhelper.create_document_from_sty_file(file_path + sty_file)
        quotedspeechhelper.annotate_sentences(doc, file_path + quoted_speech_file)
        sentences = []
        for sentence in doc.sentences:
            if sentence.is_normal():
                sentences.append(sentence.get_text() + '\n')
        file_name = settings.STORY_TXT_PATH + str(doc.id) + '.txt'
        logger.info("Writing %d sentences to %s" % (len(sentences), file_name))
        with open(file_name, 'w') as f:
            f.writelines(sentences)

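# The docstring above lists the external parsers the filtered text files are handed to,
# including "use CoreNLP server cache" for Stanford CoreNLP. The helper below is NOT part
# of the original pipeline: it is a minimal sketch, assuming a Stanford CoreNLP server is
# already running locally (default port 9000) and that settings.STORY_TXT_PATH contains
# the files written by generate_filtered_text_files(). The function name and the use of
# the `requests` library are illustrative assumptions.
def annotate_filtered_files_with_corenlp_server(server_url='http://localhost:9000'):
    import glob
    import json
    import requests  # assumed dependency, not used elsewhere in this module
    # Annotator configuration is passed to the CoreNLP server as a JSON-encoded
    # 'properties' query parameter; the raw story text goes in the POST body.
    properties = {'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,dcoref',
                  'outputFormat': 'json'}
    for file_name in glob.glob(settings.STORY_TXT_PATH + '*.txt'):
        with open(file_name) as f:
            text = f.read()
        response = requests.post(server_url,
                                 params={'properties': json.dumps(properties)},
                                 data=text)
        # Cache the server output next to the input file.
        with open(file_name + '.corenlp.json', 'w') as f:
            f.write(response.text.encode('utf-8'))
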
def get_docs_stats(feature_group, feature_distribution):
    tsv = None
    arff = None
    idxlst = ''
    #logging.root.setLevel(logging.ERROR)
    file_path = settings.STY_FILE_PATH
    documents = []
    #for sty_file in []:
    #for sty_file in settings.STY_FILES:
    #for sty_file in ['03 - Bukhtan Bukhtanovich.sty']:
    for sty_file in settings.STY_FILES[14:]:
        try:
            0/0  # intentionally raise so the except branch re-processes the .sty file instead of loading the JSON cache
            doc = voz.create_document_from_jsonpickle_file('/Users/josepvalls/temp/voz2/' + sty_file + '.json')
            logger.info("Loading JSON %s" % sty_file)
        except:
            logger.info("Processing %s" % sty_file)
            quoted_speech_file = sty_file.split()[0] + "/sentences.csv"
            quoted_speech_file = "all_sentences.tsv"
            doc = styhelper.create_document_from_sty_file(file_path + sty_file)
            if DO_REMOVE_DIALOG:
                quotedspeechhelper.annotate_sentences(doc, file_path + quoted_speech_file, format='tsv', single_sentences_file_story_id=doc.id)
                quotedspeechhelper.clean_quoted_speech_from_document(doc)
            doc.serialize_to_file(TEMP_CACHE_PATH + sty_file + '.json', use_deep_copy=True)
        # print util.string_as_print(doc.id, doc.properties.get('afanasev_new', doc.id), doc.properties.get('afanasev_old', doc.id), doc.narrative.format_summary())
        documents.append(doc)
    if False and not DO_USE_OLD_AUTO_DATA_INSTEAD_OF_STY_GT:
        #for document_id in [1004]:
        for document_id in [1001, 1002, 1003, 1004, 2001]:
            documents.append(oldannotationhelper.load_old_annotations_into_document(document_id))
    for doc in documents:
        import narrativehelper
        narrativehelper.VERB_FEATURES = feature_group
        narrativehelper.DO_COMPUTE_ROLE_DISTRIBUTION = feature_distribution
        narrativehelper.DO_USE_OLD_AUTO_DATA_INSTEAD_OF_STY_GT = DO_USE_OLD_AUTO_DATA_INSTEAD_OF_STY_GT
        assert isinstance(doc, voz.Document)
        doc.narrative.filter_non_actual_default = DO_FILTER_NONACTUAL
        doc.narrative.compute_features()
        print doc.id, "Narrative: ", doc.narrative.format(options={'one_liner': True, 'use_function_group': True})
        continue
        # NOTE: the feature-export code below is skipped by the continue above (debugging shortcut)
        print sum([i.tokens_count for i in doc.narrative.function_list])
        if DO_WRITE_FILES:
            for _ in doc.narrative.functions():
                idxlst += "%d\n" % doc.id
            if not tsv:
                tsv = doc.narrative.format_tsv()
                arff = doc.narrative.format_arff()
            else:
                tsv += doc.narrative.format_tsv(False)
                arff += doc.narrative.format_arff(False)
        if DO_PRINT_TO_SCREEN:
            #print doc.id
            for function in doc.narrative.functions():
                print doc.id, function.get_feature_vector()
    if DO_WRITE_FILES:
        open('tool_corpus_functions_summary/story_indices%s%s.txt' % (('_filtered' if DO_FILTER_NONACTUAL else ''), ('_nodiag' if DO_REMOVE_DIALOG else '')), 'w').write(idxlst)
        open('tool_corpus_functions_summary/tool_corpus_functions_summary_%d_%s%s%s%s.tsv' % (feature_group, 'dist' if feature_distribution else 'abs', '_filtered' if DO_FILTER_NONACTUAL else '', '_auto' if DO_USE_OLD_AUTO_DATA_INSTEAD_OF_STY_GT else '', '_nodiag' if DO_REMOVE_DIALOG else ''), 'w').write(tsv)
        open('tool_corpus_functions_summary/tool_corpus_functions_summary_%d_%s%s%s%s.arff' % (feature_group, 'dist' if feature_distribution else 'abs', '_filtered' if DO_FILTER_NONACTUAL else '', '_auto' if DO_USE_OLD_AUTO_DATA_INSTEAD_OF_STY_GT else '', '_nodiag' if DO_REMOVE_DIALOG else ''), 'w').write(arff)

def get_stats_docs_verbs():
    docs = []
    for sty_file in settings.STY_FILES:
        doc = styhelper.create_document_from_sty_file(settings.STY_FILE_PATH + sty_file)
        quotedspeechhelper.annotate_sentences(doc, settings.STORY_ALL_SENTENCES, format='tsv', single_sentences_file_story_id=doc.id)
        docs.append(doc)
    print sum([len(i.get_all_verbs()) for i in docs])
    print sum([sum([len([k for k in j._objects if k and k.is_independent]) + len([k for k in j._subjects if k and k.is_independent]) for j in i.get_all_verbs()]) for i in docs])

def generate_filtered_entity_file():
    logging.basicConfig(level=logging.DEBUG)
    file_path = settings.STY_FILE_PATH
    mentions = []
    for sty_file in settings.STY_FILES[2:3]:
        logger.info("Processing %s" % sty_file)
        quoted_speech_file = sty_file.split()[0] + "/sentences.csv"
        doc = styhelper.create_document_from_sty_file(file_path + sty_file)
        quotedspeechhelper.annotate_sentences(doc, file_path + quoted_speech_file)
        for sentence in doc.sentences:
            assert isinstance(sentence, voz.Sentence)
            if sentence.annotations.is_normal():
                for mention in sentence.mentions:
                    mentions.append(mention.get_text().lower() + '\n')
    file_name = '/Users/josepvalls/voz2/stories/finlayson-entities.txt'
    logger.info("Writing %d mentions to %s" % (len(mentions), file_name))
    with open(file_name, 'w') as f:
        f.writelines(mentions)

def main_print_stats():
    len_quotes = 0
    len_sentences = 0
    len_verbs = 0
    len_mentions = 0
    len_pp = 0
    len_pn = 0
    len_tokens = 0
    len_tokens_in_quotes = 0
    for story_file in settings.STY_FILES:
        print story_file
        doc = styhelper.create_document_from_sty_file(settings.STY_FILE_PATH + story_file)
        #styhelper.fix_sty_annotations(doc)
        quotedspeechhelper.annotate_sentences(doc, settings.STORY_ALL_SENTENCES, single_sentences_file_story_id=doc.id)
        output_tuple = tokenize_document(doc)
        output, quotes, mentions, verbs = output_tuple
        print tokenized_string_to_string(output, 1)
        len_quotes += len(quotes)
        len_verbs += len(verbs)
        len_mentions += len(mentions)
        len_sentences += len(doc.sentences)
        len_pp += len([i for i in mentions if [j for j in i.tokens if j.pos == 'PRP']])
        len_pn += len([i for i in mentions if [j for j in i.tokens if j.pos == 'NNP']])
        len_tokens += len(doc.get_text())
        len_tokens_in_quotes += sum([q.offset_end - q.offset for q in quotes])
    print 'TOTAL NUM QUOTES\t', len_quotes
    print 'TOTAL NUM SENT\t', len_sentences
    print 'TOTAL NUM VERBS\t', len_verbs
    print 'TOTAL NUM MENT\t', len_mentions
    print 'TOTAL NUM PP\t', len_pp
    print 'TOTAL NUM PN\t', len_pn
    print 'TOTAL NUM chars\t', len_tokens
    print 'TOTAL NUM chars in quotes\t', len_tokens_in_quotes

def create_document_using_stanford_from_filtered_sty_file(sty_file):
    import styhelper, quotedspeechhelper, entitymanager
    stats_not_found = 0
    stats_ambiguous = 0
    stats_match_ok = 0
    logger.info("Processing %s" % sty_file)
    doc = styhelper.create_document_from_sty_file(sty_file)
    quotedspeechhelper.annotate_sentences(doc, settings.STORY_ALL_SENTENCES, format='tsv', single_sentences_file_story_id=doc.id)
    text = "\n".join([sentence.get_text() for sentence in doc.sentences if sentence.annotations.is_normal()])
    doc_new = create_document_from_raw_text(text, {'story_id': doc.id + 1000})
    assert len([sentence for sentence in doc.sentences if sentence.annotations.is_normal()]) == len(doc_new.sentences), "Sentence length mismatch between annotated and processed document"
    fixed_annotation_file = settings.STORY_ANNOTATION_FIXES + '%d.tsv' % doc_new.id
    if not os.path.isfile(fixed_annotation_file):
        # Dump data for fixing
        f_fixes = open(fixed_annotation_file, 'w')
        for sentence in [sentence for sentence in doc.sentences if sentence.annotations.is_normal()]:
            mentions_check = [i for i in sentence.mentions if len([j for j in i.tokens if j.pos != 'DT']) > 1]
            mentions_check = sorted(mentions_check, key=lambda i: (len(i.child_mentions) * 100 - i.id), reverse=True)
            while mentions_check:
                mention = mentions_check.pop(0)
                assert isinstance(mention, entitymanager.Mention)
                f_data = mention.get_text() + "\t" + str(mention.get_taxonomy(entitymanager.TaxonomyContainer.TAXONOMY_ENTITY_TYPES)) + ' ' + str(mention.get_taxonomy(entitymanager.TaxonomyContainer.TAXONOMY_CHARACTER_6ROLES)) + ' ' + str(mention.get_coref_group_id())
                f_data = "\t%d\t%d\t%s\n" % (doc_new.id, mention.id, f_data)
                f_fixes.write(f_data)
                for mention_ in mention.child_mentions:
                    f_data = mention_.get_text() + "\t" + str(mention_.get_taxonomy(entitymanager.TaxonomyContainer.TAXONOMY_ENTITY_TYPES)) + ' ' + str(mention_.get_taxonomy(entitymanager.TaxonomyContainer.TAXONOMY_CHARACTER_6ROLES)) + str(mention.get_coref_group_id())
                    f_data = "\t%d\t%d\t - %s\n" % (doc_new.id, mention_.id, f_data)
                    f_fixes.write(f_data)
                    try:
                        mentions_check.remove(mention_)
                    except:
                        pass
        f_fixes.close()
    # Annotate
    fixed_annotation_file_extra = settings.STORY_ANNOTATION_FIXES + '%d-extra.tsv' % doc_new.id
    if not os.path.isfile(fixed_annotation_file_extra):
        f_fixes = open(fixed_annotation_file_extra, 'w')
    else:
        f_fixes = None
    for sentence_ref, sentence in zip([sentence for sentence in doc.sentences if sentence.annotations.is_normal()], doc_new.sentences):
        assert isinstance(sentence, voz.Sentence)
        for mention in sentence.mentions:
            if not mention.is_independent:
                continue
            assert isinstance(mention, entitymanager.Mention)
            tokens_ref = [sentence_ref.tokens[i.idx] for i in mention.tokens]
            mentions_ref = set(filter(None, [sentence_ref._parent_document.get_mention_by_token_id(i.id) for i in tokens_ref]))
            if not mentions_ref:
                logger.warning("UNABLE TO FIND ANNOTATION FOR MENTION %s" % mention.get_text())
                if f_fixes:
                    f_fixes.write("%d\tMISS\t%s\t%s\n" % (mention.id, mention.get_text(), str(mention)))
                stats_not_found += 1
                continue
            elif not len(mentions_ref) == 1:
                logger.warning("AMBIGUOUS ANNOTATION FOR MENTION")
                stats_ambiguous += 1
                # Prefer the candidate with a character-role annotation; fall back to the shortest one.
                mentions_ref = sorted(mentions_ref, key=lambda i: len(i.tokens))
                mention_ref = mentions_ref[0]
                for i in mentions_ref:
                    if i.get_taxonomy(entitymanager.TaxonomyContainer.TAXONOMY_CHARACTER_6ROLES):
                        mention_ref = i
                        break
                if f_fixes:
                    f_fixes.write("%d\tAMBG\t%s\t%s\t%s\n" % (mention.id, mention.get_text(), [str(i) for i in mentions_ref], mention_ref))
            else:
                mention_ref = mentions_ref.pop()
                stats_match_ok += 1
            if len(mention_ref.get_taxonomy(entitymanager.TaxonomyContainer.TAXONOMY_ENTITY_TYPES)) > 1:
                logger.info(util.string_as_print("POTENTIALLY IGNORE", mention_ref, mention_ref.get_taxonomy(entitymanager.TaxonomyContainer.TAXONOMY_ENTITY_TYPES)))
                mention.annotations.split_ignore = True
            mention.annotations.coref = mention_ref.get_coref_group_id()
            mention.annotations.type = (mention_ref.get_taxonomy(entitymanager.TaxonomyContainer.TAXONOMY_ENTITY_TYPES) or ['NA'])[0]
            mention.annotations.role = (mention_ref.get_taxonomy(entitymanager.TaxonomyContainer.TAXONOMY_CHARACTER_6ROLES) or ['NA'])[0]
        sentence.annotations.verbs = sentence_ref.verbs
    if f_fixes:
        f_fixes.close()
    #print stats_not_found, stats_ambiguous, stats_match_ok
    return doc_new

def get_verbs():
    logging.root.setLevel(logging.ERROR)
    file_path = settings.STY_FILE_PATH
    verbs = []
    frames = []
    functions = collections.defaultdict(list)
    import verbmanager
    mapper = verbmanager.VerbMapper(verbmanager.VerbMapper.MODE_FRAMENET_TEXT)
    for sty_file in settings.STY_FILES:
        try:
            0 / 0  # intentionally raise so the except branch re-processes the .sty file instead of loading the JSON cache
            doc = voz.create_document_from_jsonpickle_file('/Users/josepvalls/temp/voz2/' + sty_file + '.json')
            logger.info("Loading JSON %s" % sty_file)
        except:
            logger.info("Processing %s" % sty_file)
            quoted_speech_file = sty_file.split()[0] + "/sentences.csv"
            doc = styhelper.create_document_from_sty_file(file_path + sty_file)
            assert isinstance(doc, voz.Document)
            if DO_REMOVE_DIALOG:
                quotedspeechhelper.annotate_sentences(doc, file_path + quoted_speech_file)
                quotedspeechhelper.clean_quoted_speech_from_document(doc)
            doc.serialize_to_file('/Users/josepvalls/temp/voz2/' + sty_file + '.json', use_deep_copy=True)
        #print len(doc.get_all_tokens())
        logger.info(util.string_as_print(doc.id, doc.properties.get('afanasev_new', doc.id), doc.properties.get('afanasev_old', doc.id), doc.narrative.format_summary()))
        assert isinstance(doc, voz.Document)
        doc.narrative.compute_features()
        print sum([f.tokens_count for f in doc.narrative.functions(filter_non_actual=False)])
        continue
        # NOTE: the verb/frame collection below is skipped by the continue above (debugging shortcut)
        for f in doc.narrative.functions():
            assert isinstance(f, voz.narrativehelper.NarrativeFunction)
            #functions[f.function_group].extend([i.token.lemma for i in f._verbs])
            functions[f.function_group].extend([mapper.map(i.token.lemma, fallback=False) for i in doc.get_all_verbs()])
        verbs.extend([i.token.text for i in doc.get_all_verbs()])
        #frames.update([i.frame for i in doc.get_all_verbs()])
        #frames.extend(filter(None, [mapper.map(i.token.lemma, fallback=False) for i in doc.get_all_verbs()]))
        frames.extend([mapper.map(i.token.lemma, fallback=False) for i in doc.get_all_verbs()])
        #break
    sys.exit()
    roots = util.flatten(util.flatten([[i.root_hypernyms() for i in wn.synsets(verb, 'v')] for verb in verbs]))
    print len(verbs), len(set(verbs))
    print len(frames), len(set(frames))
    print len(roots)
    print collections.Counter(roots).most_common()
    print collections.Counter(frames).most_common()
    print collections.Counter(verbs).most_common()
    pprint.pprint(functions)
    vozbase.serialize_to_file([verbs, frames, functions], '/Users/josepvalls/temp/voz2/verbs.json', False, False)
    mapper.save_cache()

def make_coreferences():
    documents = []
    file_path = settings.STY_FILE_PATH
    vars_names = ['p1', 'r1', 'f1', 'p0', 'r0', 'f0', 'length', 'length**2', 'count1', 'count0', 'mentions_characters', 'char_uniq', 'coref_groups', 'c/gr', 'gr/c', 'eval']
    num_vars = len(vars_names)
    matrices_to_compute = ['OLD_STANFORD_COREF', 'OLD_NAME_COREF'] + ['OLD_RESTRICTION', 'OLD_TYPE'] + ['OLD_IDX']
    matrices_to_merge = ['OLD_STANFORD_COREF', 'OLD_NAME_COREF'] + ['OLD_RESTRICTION', 'OLD_TYPE']
    # OLD_ROLE_PRED1
    # OLD_ROLE_GT
    #matrices_to_print = matrices_to_compute + ["AGGREGATED"]
    matrices_to_print = ["AGGREGATED"]
    cumulative = dict([(i, [0.0] * num_vars) for i in matrices_to_compute + ["AGGREGATED"]])
    for sty_file in settings.STY_FILES:
        logger.info("Processing %s" % sty_file)
        quoted_speech_file = sty_file.split()[0] + "/sentences.csv"
        doc = styhelper.create_document_from_sty_file(file_path + sty_file)
        assert isinstance(doc, voz.Document)
        quotedspeechhelper.annotate_sentences(doc, file_path + quoted_speech_file)
        quotedspeechhelper.clean_quoted_speech_from_document(doc)
        doc.coreference_aux[voz.entitymanager.TaggableContainer.TAG_CHARACTER_SYMBOL] = doc.coreference
        mentions = doc.get_all_mentions()
        mentions = [i for i in mentions if i.is_independent]
        mentions = [i for i in mentions if 'CH' in i.get_taxonomy(voz.entitymanager.TaxonomyContainer.TAXONOMY_NONCHARACTER)]
        '''# create stanford, name, roles coref
        for coref_key in matrices_to_eval:
            coref_ids = sorted([i for i in set(util.flatten([mention.get_tag(coref_key) for mention in mentions]))])
            print "mentions, coref_ids", len(mentions), len(coref_ids), coref_ids
            doc.coreference_aux[coref_key] = voz.entitymanager.Coreference(doc)
            for coref_id in coref_ids:
                mentions_coref = [i for i in mentions if coref_id in i.get_tag(coref_key)]
                doc.coreference_aux[coref_key].create_coref_group_and_entity_from_mentions(doc.get_symbol_id(coref_id, 'COREF_SYMBOL'), coref_id, mentions_coref)
        # eval coref
        print voz.Document.format_stats(doc.get_stats())'''
        # eval the individual matrices and compute their table for aggregation later
        tables_to_merge = []
        table_gt_temp = None
        for coref_key in matrices_to_compute:
            print coref_key
            table, individual = voz.entitymanager.Coreference.eval_prf(coref_key, mentions)
            if table_gt_temp is None:
                table_gt_temp = table
            if coref_key in matrices_to_merge:
                tables_to_merge.append((table, individual))
            for i in xrange(num_vars):
                cumulative[coref_key][i] += individual[i]
        # aggregate the tables and evaluate aggregation
        coref_key = "AGGREGATED"
        merge_matrices(mentions, tables_to_merge, table_gt_temp)
        table, individual = voz.entitymanager.Coreference.eval_prf(coref_key, mentions)
        for i in xrange(num_vars):
            cumulative[coref_key][i] += individual[i]
        #break  # sty_file
    for j in matrices_to_print:
        for i in xrange(num_vars):
            cumulative[j][i] = cumulative[j][i] / cumulative[j][7]
    print 'CUMULATIVE OVER STORIES'
    for j in matrices_to_print:
        print j
        for i in xrange(num_vars):
            print "%s\t%f" % (vars_names[i], cumulative[j][i])
        for i in xrange(num_vars - 3, num_vars):
            print "%s\t%f" % (vars_names[i], cumulative[j][i] / 15.0)
        avg = 1.0 * (cumulative[j][2] * cumulative[j][8] + cumulative[j][3] * cumulative[j][9]) / (cumulative[j][8] + cumulative[j][9])
        print "error\t%f\t%f" % (avg, 1 - avg)
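
# Assumed entry point, not present in the original section: a minimal sketch showing how
# one of the routines above could be invoked when the module is run directly. The choice
# of main_print_stats() is illustrative only; any of the other functions defined above
# (e.g. get_verbs() or make_coreferences()) could be called here instead.
if __name__ == '__main__':
    main_print_stats()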