def __iter__(self):
    """Yield ``(doc_id, tokens)`` once per document of the first annotation file.

    Every line of the file is parsed as a JSON record with an ``id`` and an
    ``annotation`` entry. The filtered tokens of consecutive records that
    share an ``id`` are gathered into one flat list; that list is yielded
    when the ``id`` changes and once more after the last line. A document
    for which no token survives the filter is never yielded.
    """
    # NOTE: the [0:1] slice restricts the stream to the first listed file.
    for filename in self.file_list[0:1]:
        annotation_path = os.path.join(self.annotation_dir, filename)
        with open(annotation_path) as handle:
            timer = LoopTimer(update_after=100)
            collected = []
            current_id = None
            for raw_line in handle:
                if self.print_status:
                    timer.update("Lemma Doc Stream")
                record = json.loads(raw_line)
                doc_id = record['id']
                xml = record['annotation']

                # A new id closes the previous document: flush what was gathered.
                if current_id != doc_id and len(collected) > 0:
                    yield current_id, collected
                    collected = []
                current_id = doc_id

                lemma_groups = mf.xml2lemmas(xml)
                pos_groups = mf.xml2pos(xml)
                for position, lemma_group in enumerate(lemma_groups):
                    kept_tokens, _ = utils.posFilterString(lemma_group, pos_groups[position])
                    if len(kept_tokens) > 0:
                        collected.extend(kept_tokens)

            # Flush the last document of the file.
            if len(collected) > 0:
                yield current_id, collected
def __iter__(self):
    """Yield ``(doc_id, paragraph_id, tokens)`` for each record of the first file.

    Each line of the annotation file is a JSON record providing ``id``,
    ``paragraphID`` and ``annotation``. The tokens kept by
    ``utils.posFilterString`` are flattened into one list per record; the
    list is yielded even when it is empty.
    """
    # NOTE: the [0:1] slice restricts the stream to the first listed file.
    for filename in self.file_list[0:1]:
        annotation_path = os.path.join(self.annotation_dir, filename)
        with open(annotation_path) as handle:
            timer = LoopTimer(update_after=100)
            for raw_line in handle:
                if self.print_status:
                    timer.update("Lemma Para Stream")
                record = json.loads(raw_line)
                doc_id = record['id']
                para_id = record['paragraphID']
                xml = record['annotation']

                lemma_groups = mf.xml2lemmas(xml)
                pos_groups = mf.xml2pos(xml)
                paragraph_tokens = []
                for position, lemma_group in enumerate(lemma_groups):
                    kept_tokens, _ = utils.posFilterString(lemma_group, pos_groups[position])
                    if len(kept_tokens) > 0:
                        paragraph_tokens.extend(kept_tokens)
                yield doc_id, para_id, paragraph_tokens
def __iter__(self):
    """Yield the sentences of every abstract listed in ``self.abstracts``.

    For each abstract id the pickled annotation ``<id>.antn`` is loaded from
    ``self.path_to_annotations`` and converted with ``nlp_to_sent_token``.
    With ``self.output`` set to ``None`` each sentence is yielded on its
    own; with ``'all'`` the tuple ``(abstract_id, sentence_id, sentence)``
    is yielded. Any other value yields nothing. Abstracts without an
    annotation file are reported on stdout and skipped.
    """
    timer = LoopTimer(update_after=self.print_settings["update_after"],
                      avg_length=self.print_settings['avg_length'],
                      target=self.limit)
    for abstract_id in self.abstracts:
        annotation_path = os.path.join(self.path_to_annotations, abstract_id + ".antn")
        if not os.path.isfile(annotation_path):
            print()
            print(abstract_id + " in db but missing file.")
            print()
            continue

        # NOTE: pickle.load must only be used on trusted annotation files.
        with open(annotation_path, "rb") as annotation_file:
            annotation = pickle.load(annotation_file)

        sentences = nlp_to_sent_token(annotation,
                                      token_type=self.token_type,
                                      clean=self.token_cleaned,
                                      lower=self.lower,
                                      bigrams=self.bigram,
                                      dictionary=self.dictionary)
        for sentence_id, sentence in enumerate(sentences):
            if self.print_status:
                timer.update("Yield Sentence")
            if self.output is None:
                yield sentence
            elif self.output == 'all':
                yield abstract_id, sentence_id, sentence
def __iter__(self):
    """Yield ``(doc_id, paragraph_number, pos_bigrams)`` per non-empty token group.

    Each line of the first annotation file is a JSON record with
    ``annotation`` and ``id``. The paragraph number restarts at 0 whenever
    the ``id`` differs from the previous record and increases by one
    otherwise. For every token group that keeps at least one token after
    ``utils.posFilterString``, the bigrams built from the filtered POS tags
    are yielded.
    """
    # NOTE: the [0:1] slice restricts the stream to the first listed file.
    for filename in self.file_list[0:1]:
        annotation_path = os.path.join(self.annotation_dir, filename)
        with open(annotation_path) as handle:
            timer = LoopTimer(update_after=100)
            previous_id = None
            for raw_line in handle:
                if self.print_status:
                    timer.update("Posbigram Sent Stream")
                record = json.loads(raw_line)
                xml = record['annotation']
                doc_id = record['id']

                # Consecutive records of one document are numbered 0, 1, 2, ...
                para_num = 0 if previous_id != doc_id else para_num + 1
                previous_id = doc_id

                word_groups = mf.xml2words(xml)
                pos_groups = mf.xml2pos(xml)
                for position, word_group in enumerate(word_groups):
                    kept_tokens, kept_pos = utils.posFilterString(word_group, pos_groups[position])
                    if len(kept_tokens) > 0:
                        yield doc_id, para_num, utils.makeBigrams(kept_pos)
def findphrasesbyrules(rf_rules):
    """Collect the phrases matched by ``rf_rules`` in every stored tree.

    Iterates over the dependency trees held in the enclosing-scope mapping
    ``dep_tree_dict`` (keyed by ``(abstract_id, sentence_id)``) and, for each
    tree, extends the result with ``pm.get_phrases(tree, rule)`` for every
    rule in order.

    :param rf_rules: iterable of rules handed to ``pm.get_phrases``.
    :return: flat list of all phrases found, in tree-then-rule order.
    """
    found = []
    timer = LoopTimer(update_after=5000, avg_length=10000, target=len(dep_tree_dict))
    for tree in dep_tree_dict.values():
        for rule in rf_rules:
            found.extend(pm.get_phrases(tree, rule))
        timer.update("Find Phrases By Rules")
    return found
def paragraph_splitter(dtype):
    """Split the abstracts of every ``*.json`` file of ``dtype`` into paragraphs.

    For each input file ``<name>.json`` in ``data/processed/<dtype>/json`` a
    fresh ``<name>.json.para`` file is written with one JSON record per
    detected paragraph (``paragraphContent``, ``year``, ``id``,
    ``paragraphTitle``, ``paragraphID``).

    An abstract is split on newlines and every line is classified:

    * title: non-empty and either at most ``max_para_title_words`` words or
      identical to its upper-cased form;
    * paragraph: non-empty with at least ``min_para_words`` words; it is
      written together with the most recent title;
    * empty line: resets the title unless it directly follows a title.

    :param dtype: name of the data set directory below ``data/processed``.
    """
    dirname = os.path.dirname(__file__)
    json_dir = os.path.join(dirname, '../../data/processed/' + dtype + '/json')
    file_list = sorted(
        f for f in os.listdir(json_dir)
        if os.path.isfile(os.path.join(json_dir, f)) and f.endswith('.json')
    )

    max_para_title_words = 2  # Maximum number of words for a line to count as a paragraph title
    min_para_words = 10  # Minimum number of words for a line to count as a paragraph

    for filename in file_list:
        cur_path = os.path.join(json_dir, filename)
        paragraph_file = os.path.join(json_dir, filename + '.para')

        # Start from a clean output file on every run.
        if os.path.isfile(paragraph_file):
            os.remove(paragraph_file)

        print(filename)
        lt = LoopTimer()
        with open(paragraph_file, 'a') as wfile, open(cur_path) as file:
            for file_line in file:
                data = json.loads(file_line)
                paragraph_title = ''
                paragraph_id = 0
                empty_line_allowed = False

                for line in data['paperAbstract'].split('\n'):
                    word_count = len(line.split())
                    # Check whether the line matches a paragraph-title pattern.
                    if len(line) > 0 and (word_count <= max_para_title_words
                                          or line == line.upper()):
                        paragraph_title = line
                        empty_line_allowed = True
                    elif len(line) > 0 and word_count >= min_para_words:
                        new_data = {
                            'paragraphContent': line,
                            'year': data['year'],
                            'id': data['id'],
                            'paragraphTitle': paragraph_title,
                            'paragraphID': paragraph_id,
                        }
                        wfile.write(json.JSONEncoder().encode(new_data) + '\n')
                        empty_line_allowed = False
                        paragraph_id += 1
                    elif len(line) == 0 and not empty_line_allowed:
                        paragraph_title = ''

                # Fixed: progress is reported through LoopTimer.update();
                # ``update_after`` is a constructor option, not a method.
                lt.update("Para-Split")
def rf_label_ssorc_paragraphs():
    """Label ssorc paragraphs by their title and write the targets file.

    Reads the label definitions from ``data/definitions/rf_labels.txt``
    (one ``label<TAB>candidate,candidate,...`` entry per line); every
    candidate is lower-cased and stored both as is and with a trailing
    ``:``. Then every ``*.para`` file is scanned and, for each record whose
    lower-cased ``paragraphTitle`` equals a candidate, a JSON line with
    ``id``, ``paragraphID`` and ``rflabel`` is appended to
    ``rf_targets/targets.json``. The first matching label in definition
    order wins; records without a match are not written.
    """
    dirname = os.path.dirname(__file__)
    json_dir = os.path.join(dirname, '../../data/processed/ssorc/json')
    para_files = sorted(
        name for name in os.listdir(json_dir)
        if os.path.isfile(os.path.join(json_dir, name)) and name.endswith('.para')
    )

    target_file_path = os.path.join(
        dirname, '../../data/processed/ssorc/rf_targets/targets.json')
    # Start from a clean targets file on every run.
    if os.path.isfile(target_file_path):
        os.remove(target_file_path)

    labels = dict()
    with open(os.path.join(dirname, '../../data/definitions/rf_labels.txt')) as rfdef:
        for definition in rfdef:
            fields = definition.split('\t')
            variants = []
            for candidate in fields[1].rstrip().split(','):
                if len(candidate) > 0:
                    lowered = candidate.lower()
                    variants.append(lowered)
                    variants.append(lowered + ":")
            labels[fields[0]] = variants

    with open(target_file_path, 'a') as target_file:
        for filename in para_files:
            cur_path = os.path.join(json_dir, filename)
            lt = LoopTimer()
            with open(cur_path) as file:
                for file_line in file:
                    data = json.loads(file_line)
                    title = data['paragraphTitle'].lower()

                    # -1 marks "no label matched this title".
                    rf_label = next(
                        (key for key in labels if title in labels[key]), -1)

                    if rf_label != -1:
                        target_data = {
                            'id': data['id'],
                            'paragraphID': data['paragraphID'],
                            'rflabel': rf_label,
                        }
                        target_file.write(
                            json.JSONEncoder().encode(target_data) + '\n')
                    lt.update("RF Labeling")
def __iter__(self):
    """Yield ``(doc_id, paragraph_id, xml)`` for each record of the first file.

    Each line of the annotation file is a JSON record with ``annotation``,
    ``id`` and ``paragraphID``. Records whose annotation string does not
    start with ``<?xml`` are skipped.
    """
    # NOTE: the [0:1] slice restricts the stream to the first listed file.
    for filename in self.file_list[0:1]:
        annotation_path = os.path.join(self.annotation_dir, filename)
        with open(annotation_path) as handle:
            timer = LoopTimer(update_after=100)
            for raw_line in handle:
                if self.print_status:
                    timer.update("XML Para Stream")
                record = json.loads(raw_line)
                xml = record['annotation']
                doc_id = record['id']
                para_num = record['paragraphID']
                if not xml.startswith('<?xml'):
                    continue
                yield doc_id, para_num, xml
def build_scipy_feature_file(dtype, prefix='', num_samples=10000):
    """Build a sparse TF-IDF feature matrix and save it as an ``.npz`` file.

    Loads ``lemma.dic`` and ``lemma_model.tfidf`` for ``dtype``, streams the
    lemma documents and stores their TF-IDF weights in a CSC matrix with one
    row per processed document and one column per dictionary entry. The
    result is written to ``features/tm_features_<prefix>.npz``, replacing
    any existing file.

    :param dtype: name of the data set directory below ``data/processed``.
    :param prefix: suffix used in the feature file name.
    :param num_samples: maximum number of documents to process; ``-1``
        processes the whole stream.
    """
    dirname = os.path.dirname(__file__)
    dictionary_dir = os.path.join(dirname, '../../data/processed/' + dtype + '/dictionaries')
    tfidf_dir = os.path.join(dirname, '../../data/processed/' + dtype + '/tfidf')
    feature_file = os.path.join(
        dirname, '../../data/processed/' + dtype + '/features/tm_features_' + prefix + '.npz')

    if os.path.isfile(feature_file):
        os.remove(feature_file)

    lemma_dic = gensim.corpora.Dictionary.load(os.path.join(dictionary_dir, 'lemma.dic'))
    lemma_tfidf = gensim.models.TfidfModel.load(os.path.join(tfidf_dir, 'lemma_model.tfidf'))
    lemma_corpus = corpora.lemma_doc_stream(dtype)

    lt = LoopTimer()
    row = []
    col = []
    data = []
    # Number of documents actually turned into rows. The previous
    # ``m = idx + 1`` counted one row too many whenever the loop stopped at
    # ``idx == num_samples`` and failed with NameError on an empty stream.
    num_docs = 0
    for idx, lemmas in enumerate(lemma_corpus):
        if num_samples != -1 and idx == num_samples:
            break

        # Each stream item is a tuple whose second element holds the tokens.
        lemma_bow = lemma_dic.doc2bow(lemmas[1])
        for term_id, weight in lemma_tfidf[lemma_bow]:
            row.append(idx)
            col.append(term_id)
            data.append(weight)
        num_docs = idx + 1
        lt.update("Build Features")

    m = num_docs
    n = len(lemma_dic)

    # Explicit integer dtype keeps the index arrays valid even when empty.
    row = np.array(row, dtype=np.intp)
    col = np.array(col, dtype=np.intp)
    data = np.array(data)

    feature_vector = scipy.sparse.csc_matrix((data, (row, col)), shape=(m, n))
    scipy.sparse.save_npz(feature_file, feature_vector)
def __iter__(self):
    """Yield the pickled annotation of every abstract in ``self.abstracts``.

    For each abstract id the file ``<id>.antn`` in
    ``self.path_to_annotations`` is unpickled. With ``self.output`` set to
    ``None`` the annotation alone is yielded; with ``'all'`` the tuple
    ``(abstract_id, annotation)`` is yielded. Any other value yields
    nothing. Abstracts without an annotation file are reported on stdout
    and skipped.
    """
    timer = LoopTimer(update_after=self.print_settings["update_after"],
                      avg_length=self.print_settings['avg_length'],
                      target=self.limit)
    for abstract_id in self.abstracts:
        annotation_path = os.path.join(self.path_to_annotations, abstract_id + ".antn")
        if not os.path.isfile(annotation_path):
            print()
            print(abstract_id + " in db but missing file.")
            print()
            continue

        # NOTE: pickle.load must only be used on trusted annotation files.
        with open(annotation_path, "rb") as annotation_file:
            annotation = pickle.load(annotation_file)

        if self.print_status:
            timer.update("Yield Abstract")
        if self.output is None:
            yield annotation
        elif self.output == 'all':
            yield abstract_id, annotation
def make_dictionaries(dtype):
    """Build and save the five gensim dictionaries for ``dtype``.

    The word, word-bigram, POS, POS-bigram and lemma document streams are
    consumed in lockstep (iteration stops with the shortest stream). The
    second element of every stream item is added to the matching
    dictionary. The dictionaries are saved as ``full_*.dict`` below
    ``data/interim/<dtype>`` and then printed.

    :param dtype: name of the data set directory below ``data/interim``.
    """
    dirname = os.path.dirname(__file__)
    inter_dir = os.path.join(dirname, '../../data/interim', dtype)
    prune_at = 20000000

    word_dic = gensim.corpora.Dictionary()
    pos_dic = gensim.corpora.Dictionary()
    lemma_dic = gensim.corpora.Dictionary()
    wordbi_dic = gensim.corpora.Dictionary()
    posbi_dic = gensim.corpora.Dictionary()

    streams = (
        corpora.word_doc_stream(dtype),
        corpora.wordbigram_doc_stream(dtype),
        corpora.pos_doc_stream(dtype),
        corpora.posbigram_doc_stream(dtype),
        corpora.lemma_doc_stream(dtype),
    )

    lt = LoopTimer()
    for word_doc, wordbigram_doc, pos_doc, posbigram_doc, lemma_doc in zip(*streams):
        updates = (
            (lemma_dic, lemma_doc),
            (word_dic, word_doc),
            (wordbi_dic, wordbigram_doc),
            (pos_dic, pos_doc),
            (posbi_dic, posbigram_doc),
        )
        for dictionary, doc in updates:
            # Index 1 of a stream item holds the token list.
            dictionary.add_documents([doc[1]], prune_at=prune_at)
        lt.update("Build Dictionaries")

    outputs = (
        (lemma_dic, 'full_lemma.dict'),
        (wordbi_dic, 'full_wordbi.dict'),
        (word_dic, 'full_word.dict'),
        (posbi_dic, 'full_posbi.dict'),
        (pos_dic, 'full_pos.dict'),
    )
    for dictionary, file_name in outputs:
        dictionary.save(os.path.join(inter_dir, file_name))

    for dictionary in (word_dic, wordbi_dic, pos_dic, posbi_dic, lemma_dic):
        print(dictionary)
def make_ssorc_data():
    """Filter raw ssorc records into a slim JSON-lines file.

    Processes the second file (slice ``[1:2]``) of the sorted listing of
    ``data/raw/ssorc``. A record is kept when it has ``year``,
    ``paperAbstract`` and ``doi``, the year and DOI are not empty strings,
    the abstract has more than 50 whitespace-separated words and
    ``check_string_for_english`` accepts the abstract. Kept records are
    written to ``data/processed/ssorc/json/<name>.json`` with the keys
    ``year``, ``paperAbstract`` and ``id`` (the DOI).
    """
    dirname = os.path.dirname(__file__)
    raw_dir = os.path.join(dirname, '../../data/raw/ssorc')
    raw_files = sorted(
        name for name in os.listdir(raw_dir)
        if os.path.isfile(os.path.join(raw_dir, name))
    )
    json_dir = os.path.join(dirname, '../../data/processed/ssorc/json')
    required_keys = ('year', 'paperAbstract', 'doi')

    for filename in raw_files[1:2]:
        cur_path = os.path.join(raw_dir, filename)
        json_file = os.path.join(json_dir, filename + '.json')

        # Start from a clean output file on every run.
        if os.path.isfile(json_file):
            os.remove(json_file)

        print(filename)
        lt = LoopTimer()
        with open(json_file, 'a') as wfile, open(cur_path) as file:
            for file_line in file:
                data = json.loads(file_line)
                keep = (
                    all(key in data for key in required_keys)
                    and data['year'] != ''
                    and len(data['paperAbstract'].split()) > 50
                    and data['doi'] != ''
                    and check_string_for_english(data['paperAbstract'])
                )
                if keep:
                    new_data = {
                        'year': data['year'],
                        'paperAbstract': data['paperAbstract'],
                        'id': data['doi'],
                    }
                    wfile.write(json.JSONEncoder().encode(new_data) + '\n')
                lt.update("Make Data")
def make_dblp_data():
    """Merge the dblp information and abstract TSV files into ``dblp.json``.

    ``data_information.tsv`` and ``data_abstracts.tsv`` are read in
    lockstep, line by line. From the information line the DOI (column 1)
    and the year (column 4, converted to ``int``) are used; from the
    abstract line the abstract text (column 1). Records whose abstract
    passes ``check_string_for_english`` are written as JSON lines with the
    keys ``year``, ``paperAbstract`` and ``id``.
    """
    dirname = os.path.dirname(__file__)
    raw_dir = os.path.join(dirname, '../../data/raw/dblp')
    data_abstract_file = os.path.join(raw_dir, 'data_abstracts.tsv')
    data_information_file = os.path.join(raw_dir, 'data_information.tsv')
    json_dir = os.path.join(dirname, '../../data/processed/dblp/json')
    json_file = os.path.join(json_dir, 'dblp.json')

    # Start from a clean output file on every run.
    if os.path.isfile(json_file):
        os.remove(json_file)

    with open(data_information_file) as infofile, \
            open(data_abstract_file) as abstractfile, \
            open(json_file, 'a') as jfile:
        lt = LoopTimer()
        added = 0
        for infoline, abstractline in zip(infofile, abstractfile):
            info_fields = infoline.split('\t')
            abstract_fields = abstractline.split('\t')

            doi = info_fields[1]
            year = int(info_fields[4])
            abstract_text = abstract_fields[1]

            if check_string_for_english(abstract_text):
                record = {
                    'year': year,
                    'paperAbstract': abstract_text,
                    'id': doi,
                }
                jfile.write(json.JSONEncoder().encode(record) + '\n')
                added += 1
            lt.update(str(added) + " Abstracts added")
def make_deptree(mod_name, dep_tree_dict, dictionary, dep_type='basicDependencies', limit=2000):
    """Fill ``dep_tree_dict`` with dependency trees of annotated ML abstracts.

    Selects up to ``limit`` annotated abstracts whose entities mention
    "machine learning" from the ``abstracts_ml`` table, streams their
    annotations and stores every tree returned by ``sentence2tree`` under
    the key ``(abstract_id, sentence_id)``. Sentences for which no tree is
    returned are skipped.

    :param mod_name: unused here; kept so existing callers keep working.
    :param dep_tree_dict: mapping that is updated in place.
    :param dictionary: passed through to ``sentence2tree``; its size is
        printed at the end.
    :param dep_type: dependency flavour passed to ``sentence2tree``
        (the original notes also mention 'enhancedDependencies' and
        'enhancedPlusPlusDependencies' -- presumably accepted as well).
    :param limit: maximum number of abstracts to select.
    """
    connection = mysql.connector.connect(
        host="localhost",
        user="******",
        passwd="thesis",
    )
    # try/finally so the connection is released even when a query fails.
    try:
        cursor = connection.cursor()
        cursor.execute("USE ssorc;")

        # int() guarantees that only a number is interpolated into the SQL.
        sq1 = f"SELECT abstract_id FROM abstracts_ml WHERE entities LIKE '%machine learning%' AND annotated=1 LIMIT {int(limit)}"
        cursor.execute(sq1)

        print("Collecting Abstracts")
        abstracts = {row[0] for row in cursor}
    finally:
        connection.close()

    print(f"{len(abstracts)} to build.")
    size = len(abstracts)

    annotations = AnnotationStream(abstracts=abstracts, output='all')

    lc = LoopTimer(update_after=10, avg_length=200, target=size)
    for abstract_id, annotation in annotations:
        for sentence in annotation['sentences']:
            dep_tree = sentence2tree(sentence, dictionary=dictionary, dep_type_=dep_type)
            sentence_id = int(sentence['index'])
            if dep_tree is not None:
                dep_tree_dict[(abstract_id, sentence_id)] = dep_tree
        lc.update("Build Dep Tree Dict")

    print()
    print(f"Size of Dictionary: {len(dictionary)}")
abstracts = set() abstract_labels = dict() for idx, row in enumerate(cursor): abstract_id = row[0] abstract_label = row[1] abstracts.add(abstract_id) abstract_labels[abstract_id] = abstract_label connection.close() corpus = corpora.TokenDocStream(abstracts=abstracts, token_type=token_type, print_status=True, output='all', lower=True) row = [] col = [] data = [] lt = LoopTimer() labels = list() for idx, document in enumerate(corpus): words = document[1] abstract_id = document[0] label = abstract_labels[abstract_id] labels.append(label) bow = dictionary.doc2bow(words) vec_tfidf = tfidf[bow] for entry in vec_tfidf: row.append(idx)
vector_len += posbigram_vec_len with open(target_path, 'rb') as target_file: label_dic = pickle.load(target_file) label_count = dict() for lkey in label_dic: label = label_dic[lkey] if label not in label_count: label_count[label] = 0 label_limit = limit / len(label_count) last_abstract_id = None lc = LoopTimer(update_after=5, avg_length=1000) sent_infos = list() max_sent = 0 breaker = 0 for abstract_id, row in df.iterrows(): word_sentence_tokens = [sentence.split(" ") for sentence in row['word'].split("\t")] pos_sentence_tokens = [sentence.split(" ") for sentence in row['pos'].split("\t")] for sent_id, (word_tokens, pos_tokens) in enumerate(zip(word_sentence_tokens, pos_sentence_tokens)): if (last_abstract_id is not None) and (last_abstract_id != abstract_id): for feature_data in sent_infos: did = feature_data['id'] sid = feature_data['sent_id'] label_key = (did, sid)
panda_path = "/media/norpheo/mySQL/db/ssorc/pandas" ml_words = set() ai_words = set() ml_abstracts = dict() ai_abstracts = dict() count = 0 filename = f"all_ent_{count}.dict" while os.path.isfile(os.path.join(panda_path, filename)): with open(os.path.join(panda_path, filename), "rb") as dict_file: dict_frame = pickle.load(dict_file) print(filename) lt = LoopTimer(update_after=50000, avg_length=50000, target=len(dict_frame)) for abstract_id in dict_frame: entities = dict_frame[abstract_id] if 'machine learning' in entities: ml_words.update(set(entities)) ml_abstracts[abstract_id] = entities if 'artificial intelligence' in entities: ai_words.update(set(entities)) ai_abstracts[abstract_id] = entities lt.update("Parse Dict") print()
suffix='.pickle') connection = mysql.connector.connect( host="localhost", user="******", passwd="thesis", ) cursor = connection.cursor() cursor.execute("USE ssorc;") with open(path_to_feature_file, "rb") as feature_file: documents = pickle.load(feature_file) pos = 0 neg = 0 lc = LoopTimer(update_after=50, avg_length=700, target=len(documents)) for abstract_id in documents: features = documents[abstract_id] prediction = mllr.predict_proba(features) if prediction[0][1] > 0.8: label = 1 pos += 1 else: label = -1 neg += 1 sql = f'UPDATE abstracts SET isML = {label} WHERE abstract_id = "{abstract_id}"' cursor.execute(sql) lc.update("Classify")
path_to_mlgenome = os.path.join(path_to_db, "mlgenome", nlp_model) path_to_mlgenome_features = os.path.join(path_to_mlgenome, "features") if not os.path.isdir(path_to_mlgenome_features): print(f"Create Directory {path_to_mlgenome_features}") os.mkdir(path_to_mlgenome_features) with open(os.path.join(path_to_mlgenome, "unique_mentions.pickle"), "rb") as handle: mentions = pickle.load(handle) feature_vector = [] lt_target = len(mentions) lt = LoopTimer(update_after=5000, avg_length=10000, target=lt_target) for mention in mentions: m_string = mention["string"] m_is_acronym = mention['is_acronym'] m_vec = mention['m_vec'] #if m_is_acronym: # continue feature_vector.append(m_vec) breaker = lt.update(f"Make Training-Set - {len(feature_vector)}") print(len(feature_vector)) feature_dict = dict()
"icdm", "kdd", "uai", "cvpr", "iclr", "wsdm", "aistats"] journal_dict = dict() venue_dict = dict() total_count = 0 ml_count = 0 ai_count = 0 lt = LoopTimer(update_after=50000, avg_length=500000, target=39*1000000+219709) for filename in file_list[0:]: cur_path = os.path.join(raw_dir, filename) print() print(filename) dictFrame = dict() with open(cur_path) as file: for idx, file_line in enumerate(file): data = json.loads(file_line) if all(key in data for key in req_keys): entities = [entity.lower() for entity in data['entities']] journal = data['journalName'].lower() venue = data['venue'].lower()
infoDF = pd.read_pickle(os.path.join(path_to_annotations, 'info_db.pandas')) abstract_id_list = list() word_list = list() lemma_list = list() coarse_pos_list = list() fine_pos_list = list() ent_type_list = list() merged_word_list = list() merged_ent_type_list = list() targ = len(infoDF) not_found = 0 lt = LoopTimer(update_after=100, avg_length=10000, target=targ) for abstract_id, row in infoDF.iterrows(): file_path = os.path.join(path_to_annotations, f"{abstract_id}.spacy") if not os.path.isfile(file_path): not_found += 1 lt.update(f"Create Pandas - {len(abstract_id_list)}") continue doc = Doc(vocab).from_disk(file_path) abstract_id_list.append(abstract_id) word_list.append("\t\t".join([ "\t".join( [token.text for token in sentence if token_conditions(token)]) for sentence in doc.sents ]))
dbcon = DBConnector(db="ssorc") path_to_db = "/media/norpheo/mySQL/db/ssorc" path_to_annotations = os.path.join(path_to_db, "annotations") path_to_raw = os.path.join(path_to_db, "raw") connection = mysql.connector.connect( host="localhost", user="******", passwd="thesis", ) cursor = connection.cursor() cursor.execute("USE ssorc;") cursor.execute("SELECT abstract_id FROM abstracts WHERE annotated=0 and year>1990") lc = LoopTimer(update_after=1000) abstracts_to_process = set() for idx, row in enumerate(cursor): abstracts_to_process.add(row[0]) lc.update("Collect Abstracts to Process") connection.close() print() print("There are " + str(len(abstracts_to_process)) + " files to process") annotators = 'tokenize,ssplit,pos,lemma,depparse' splitter_annotators = 'ssplit' nlp = StanfordCoreNLP('../../../stanford-corenlp-full-2018-02-27') props = {'annotators': annotators, 'pipelineLanguage': 'en', 'outputFormat': 'json'} split_props = {'annotators': splitter_annotators, 'pipelineLanguage': 'en', 'outputFormat': 'json'}
feature_dict = pickle.load(feature_file) all_features = feature_dict["features"] all_targets = feature_dict["targets"] print("Feature-Vector-Shape: " + str(all_features.shape)) learning_features, holdback_features, learning_targets, holdback_targets = train_test_split( all_features, all_targets, test_size=0.4, random_state=4, shuffle=True) best_para = 0 best_score = 0 score_list = list() print("Start Training:") lc = LoopTimer(update_after=1, avg_length=5, target=len(reg_paras)) for c_para in reg_paras: #model = svm.SVC(decision_function_shape='ovo', # C=c_para, # kernel='rbf', # gamma='auto') model = svm.SVC(kernel="linear", C=c_para, decision_function_shape='ovo') scores = cross_val_score(model, learning_features, learning_targets, cv=10, n_jobs=-1) mean_score = scores.mean() if mean_score > best_score: best_score = mean_score
req_keys = ['title', 'authors', 'inCitations', 'outCitations', 'year', 'paperAbstract', 'id', 'entities', 'journalName', 'venue'] journal_dict = dict() venue_dict = dict() lt = LoopTimer(update_after=50000, avg_length=500000, target=39*1000000+219709) for filename in file_list[0:]: cur_path = os.path.join(raw_dir, filename) print() print(filename) dictFrame = dict() with open(cur_path) as file: for idx, file_line in enumerate(file): data = json.loads(file_line) if all(key in data for key in req_keys): entities = [entity.lower() for entity in data['entities']] journal = data['journalName'].lower() venue = data['venue'].lower()
import os
import pickle  # Fixed: pickle.load() is used below but was never imported.

import mysql.connector

from src.utils.LoopTimer import LoopTimer

# Script: register every abstract id found in the pickled ML-algorithm
# mapping in the ``mlabstracts`` table of the ``ssorc`` database.

path_to_ner = "/media/norpheo/mySQL/db/ssorc/NER"
path_to_ml_algo_abstract_save = os.path.join(path_to_ner, "ml_algo_abstract_new.pickle")

# NOTE: pickle.load must only be used on trusted files.
with open(path_to_ml_algo_abstract_save, "rb") as algo_abstract_file:
    ml_algo_abstract = pickle.load(algo_abstract_file)

connection = mysql.connector.connect(
    host="localhost",
    user="******",
    passwd="thesis",
)
# try/finally so the connection is released even when a statement fails.
try:
    cursor = connection.cursor()
    cursor.execute("USE ssorc;")

    # Parameterized statement: the driver escapes the id instead of it being
    # pasted into the SQL text.
    sq1 = ('INSERT INTO mlabstracts (abstract_id) VALUES(%s) '
           'ON DUPLICATE KEY UPDATE abstract_id = (%s)')

    lc = LoopTimer(update_after=100, avg_length=10000)
    for abstract_id in ml_algo_abstract:
        # str() mirrors the previous f-string formatting of the id.
        id_text = str(abstract_id)
        cursor.execute(sq1, (id_text, id_text))
        lc.update("Insert Into")

    # Single commit after all rows were sent.
    connection.commit()
finally:
    connection.close()
import os import pickle from src.utils.LoopTimer import LoopTimer path_to_db = "/media/norpheo/mySQL/db/ssorc" target_path = os.path.join(path_to_db, 'features', 'rf_targets_hl_sanity.pickle') label_path = os.path.join(path_to_db, 'rf_hand_labels', 'sanity_data.csv') targets = dict() label_set = dict() lc = LoopTimer(update_after=1000, avg_length=5000) with open(label_path, 'r') as label_file: for line in label_file: info = line.replace('\n', '').split('\t') if len(info) != 4: print(len(info)) print(line) continue abstract_id = info[0] sent_id = info[1] label = info[3] if label == '0': continue if label not in label_set: label_set[label] = 0
with open(os.path.join(path_to_mlgenome, "svm_mlares.pickle"), "rb") as model_file: classifier = pickle.load(model_file) with open(os.path.join(path_to_mlgenome, "mentions.pickle"), "rb") as handle: mentions = pickle.load(handle) mlares_clf = classifier["model"] lt_target = len(mentions) * len(mentions) print(f"Mentions*Mentions: {round(lt_target/1000000000, 3)} Billion") entity_linking = dict() breaker = 0 lt = LoopTimer(update_after=5000, avg_length=10000, target=lt_target) for mention in mentions: m_string = mention["mention_string"] m_tokens = mention["mention_tokens"] m_dvec = mention["doc_vector"] m_svec = mention["sentence_vector"] m_mvec = mention["mention_vector"] for candidate in mentions: c_string = candidate["mention_string"] c_tokens = candidate["mention_tokens"] c_dvec = candidate["doc_vector"] c_svec = candidate["sentence_vector"] c_mvec = candidate["mention_vector"] similarity = mlares.similarity_ratio(m_string, c_string)
ent_counter = dict() #forbidden_dep = ['csubj', 'nummod', 'cc', 'advmod', 'preconj', 'attr', 'det'] #forbidden_pos = ['VERB', 'ADP'] forbidden_dep = ['det', 'predet', 'nummod', 'cc', 'appos', 'punct', 'conj'] forbidden_pos = ['ADP', 'VERB', 'X', 'ADV'] forbidden_substrings = [ 'state-of-the-art', ',', '(', ')', "approaches", "approach", "algorithm", "algorithms", "based", "function", "functions", "other", "large", "larger", "twitter", "such" ] collect_ml = set() lt = LoopTimer(update_after=200, avg_length=2000, target=targ) for abstract_id, row in infoDF.iterrows(): ori_doc = Doc(vocab).from_disk( os.path.join(path_to_annotations, f"{abstract_id}.spacy")) for sent in ori_doc.sents: sentence = sent.as_doc() matches = matcher(sentence) ent_list = list() for match in matches: start = match[1] end = match[2]
track_topics = {"SVM": [166], "NeuralNetwork": [50, 114, 140] } tm_model = TopicModelingLDA(info_fn=tm_info_file_name) wordDF = pd.read_pickle(os.path.join(path_to_db, "pandas", "aiml_ner_merged_word.pandas")) infoDF = pd.read_pickle(os.path.join(path_to_db, "pandas", "ner_info_db.pandas")) df = infoDF.join(wordDF) timeseries = dict() for topic in track_topics: timeseries[topic] = dict() year_count = dict() lc = LoopTimer(update_after=100, avg_length=5000, target=len(df)) for abstract_id, row in df.iterrows(): year = row['year'] text = row['merged_word'].replace("\t\t", "\t") tokens = row['merged_word'].split("\t") topic_dist = tm_model.get_topic_dist(tokens) top_n_topics = topic_dist.argsort()[::-1][:5] if year not in year_count: year_count[year] = 0 for topic in track_topics: timeseries[topic][year] = 0 year_count[year] += 1
dependency = data['dependency'] rule = (trigger_word, dependency) if category not in rules: rules[category] = set() rules[category].add(rule) print("Loading Vocab...") vocab = Vocab().from_disk(os.path.join(path_to_annotations, "spacy.vocab")) infoDF = pd.read_pickle(os.path.join(path_to_annotations, 'info_db.pandas')) db_size = len(infoDF) lc = LoopTimer(update_after=100, avg_length=200, target=db_size) predictions = dict() targets = dict() target_vector = list() feature_vector = list() for idx, (abstract_id, df_row) in enumerate(infoDF.iterrows()): file_path = os.path.join(path_to_annotations, f"{abstract_id}.spacy") doc = Doc(vocab).from_disk(file_path) for n_sents, sent in enumerate(doc.sents): pass