def getVideos(root):
    '''Return all videos in the root directory and its subdirectories,
    or the root itself if it is a video file.'''
    videos = []
    if path.isfile(root):
        dirpath, nameWithExt = path.split(root)
        name, ext = path.splitext(nameWithExt)
        if ext[1:] in common.videoTypes:
            videos.append(
                common.videoToDict(dirpath, name, common.tokenize(name)))
        else:
            raise Exception('File is not a known video format')
    else:
        for (dirpath, dirnames, filenames) in os.walk(root):
            # keep files whose extension (leading dot stripped) is in videoTypes
            for f in filenames:
                name, ext = path.splitext(f)
                if ext[1:] in common.videoTypes:
                    videos.append(common.videoToDict(
                        dirpath, name, common.tokenize(name)))
        if len(videos) == 0:
            raise Exception(
                'No file with known video format found in the directory')
    return videos
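# Usage sketch (illustrative, not from the original source): '/path/to/media' is a
# placeholder path, and getVideos raises if nothing matching common.videoTypes is found.
try:
    for video in getVideos('/path/to/media'):
        print(video)
except Exception as err:
    print('indexing failed:', err)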
def transform(self, documents):
    documents = [tokenize(d) for d in documents]
    documents = [d[:self.max_page_size] for d in documents]
    documents = [' '.join(d) for d in documents]
    if self.encoding_type in ['tfidf', 'count', 'binary']:
        return self.vectorizer.transform(documents).toarray()
    if self.encoding_type == 'lda':
        documents_tokenized = [tokenize(i) for i in documents]
        other_corpus = [
            self.common_dictionary.doc2bow(i) for i in documents_tokenized
        ]
        results = []
        for i in other_corpus:
            result = self.vectorizer[i]
            result = vectorize_topic_models(result, self.num_of_topics)
            results.append(result)
        return np.array(results)
    if self.encoding_type in ['doc2vec']:
        documents_tokenized = [tokenize(i) for i in documents]
        results = []
        for i in documents_tokenized:
            if i:
                try:
                    results.append(self.vectorizer[i][0])
                except KeyError:
                    results.append([0 for _ in range(self.encoding_size)])
            else:
                results.append([0 for _ in range(self.encoding_size)])
        return np.array(results)
    if self.encoding_type in ['fasttext']:
        documents_clean = [clean_text(i) for i in documents]
        results = []
        for i in documents_clean:
            if i:
                results.append(self.vectorizer.get_sentence_vector(i))
                # results.append(self.vectorizer[i])
            else:
                results.append(
                    np.array([0 for _ in range(self.encoding_size)]))
        return np.array(results)
def convert_format(self):
    sentences = []
    words = pkl.load(open("LM_corpura//%s//%s" % (cfg['lm_corpus'],
                                                  cfg['corpus__dict_file']), 'rb'))
    word_dict = dict([(word, key) for key, word in enumerate(words, 1)])
    # word_dict = common.get_word_dict(self.conf['index2word_path'])
    if self.name == "pos_tagging":
        tags = set([tag for word, tag in treebank.tagged_words()])
        tag_index = {tag: idx for idx, tag in enumerate(tags, 1)}
        for sentence in treebank.tagged_sents():
            sent_words = [(word_dict[common.tokenize(w)], tag_index[t])
                          if common.tokenize(w) in word_dict
                          else (0, tag_index[t])
                          for w, t in sentence]
            sentences.append(sent_words)
    return sentences
def read_data(self, tagged_sentences):
    features = []
    tags = []
    for sentence in tagged_sentences:
        sent_words = [
            self.word_dict[common.tokenize(w)]
            if common.tokenize(w) in self.word_dict else len(self.words)
            for w, t in sentence
        ]
        sent_tags = [t for w, t in sentence]
        sent_representations = self.lm.predict(np.asarray(sent_words))
        sent_representations = np.squeeze(sent_representations, axis=1)
        features.append(sent_representations)
        tags.append(sent_tags)
    all_tags = set(np.concatenate(tags))
    tags_enum = [(tag, idx) for idx, tag in enumerate(all_tags)]
    tag_dict = dict(tags_enum)
    tags = [[tag_dict[tag] for tag in sent_tags] for sent_tags in tags]
    # features = np.concatenate(features)
    tags = np.asarray(tags)
    tags = keras.utils.to_categorical(tags)
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        tags,
                                                        test_size=0.2,
                                                        random_state=42)
    for size in [0.01, 0.05, 0.1, 0.2]:
        print("Using %d examples" % (int(X_train.shape[0] * size)))
        svm = SVC(kernel='linear')
        svm.fit(X_train[:int(X_train.shape[0] * size)],
                y_train[:int(X_train.shape[0] * size)])
        score = svm.score(X_test, y_test)
        print("for %.2f%% of the data: %.2f%% accuracy" %
              (100 * size, 100 * score))
def process_html(r_text, r_time, url, timestamp, file_name):
    if r_text:
        record = generate_link_dict(url)
        soup = BeautifulSoup(r_text, 'lxml')
        new_links = [i['href'] for i in soup.find_all('a', href=True)]
        new_abs_links = [i for i in new_links
                         if is_link_external(i, record['netloc'])]
        record['page_external_links'] = str(new_abs_links)
        record['request_time'] = r_time
        record['request_timestamp'] = timestamp
        meta_data = get_meta_info_from_html(r_text)
        page_text = get_text_from_html(r_text)
        record['html_char_len'] = len(r_text)
        record['text_char_len'] = len(page_text)
        record['meta_char_len'] = len(meta_data)
        record['html_word_len'] = len(tokenize(r_text))
        record['text_word_len'] = len(tokenize(page_text))
        record['meta_word_len'] = len(tokenize(meta_data))

        with open(f'{dir_loc}/all_html_chunks/{file_name}.txt', 'a') as f:
            f.write(f'{url}{sep_char}{str(r_text).replace(sep_char, "")}' + "\n")
        with open(f'{dir_loc}/all_meta_chunks/{file_name}.txt', 'a') as f:
            f.write(f'{url}{sep_char}{str(meta_data).replace(sep_char, "")}' + "\n")
        with open(f'{dir_loc}/all_text_chunks/{file_name}.txt', 'a') as f:
            f.write(f'{url}{sep_char}{str(page_text).replace(sep_char, "")}' + "\n")

        record['file_name'] = str(file_name)
        record_df = pd.DataFrame.from_dict([record])
        record_df = record_df.set_index('url')
        while True:
            try:
                with sqlite3.connect(f'{dir_loc}/dbs/{db_name}') as conn_disk:
                    record_df.to_sql('websites', conn_disk,
                                     if_exists='append', index=True)
                break
            except sqlite3.OperationalError:
                time.sleep(5)
                print('db locked')
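# Hedged read-back sketch (not part of the original code): assumes the same dir_loc and
# db_name globals used above; sqlite3.connect and pandas.read_sql are standard APIs, and
# the selected columns are ones written by process_html.
def load_crawled_records():
    with sqlite3.connect(f'{dir_loc}/dbs/{db_name}') as conn_disk:
        return pd.read_sql('SELECT url, request_time, text_word_len FROM websites',
                           conn_disk)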
def train(self, corpus_filename: str):
    word_freq = {}
    p = Path(corpus_filename)
    with open(corpus_filename) as f:
        parenstack = []
        words_pos = []
        line_number = 1
        for line in f:
            line_number += 1
            tokens = common.tokenize(line)
            tokens_in_node = []
            for token in tokens:
                if token == '(':
                    parenstack.append('(')
                    tokens_in_node = []
                elif token == ')':
                    parenstack.pop()
                    if len(tokens_in_node) == 2:
                        words_pos.append(tuple(tokens_in_node))
                        pos, word = tokens_in_node
                        if not word in word_freq:
                            word_freq[word] = {}
                            word_freq[word][pos] = 1
                        else:
                            if not pos in word_freq[word]:
                                word_freq[word][pos] = 1
                            else:
                                word_freq[word][pos] += 1
                        tokens_in_node = []
                else:
                    tokens_in_node.append(token)
    p = Path(self.PICKLE_FILE)
    with p.open('wb') as output_file:
        pickle.dump(word_freq, output_file, pickle.HIGHEST_PROTOCOL)
print('Loaded dictionary:', DICT_PATH)
tfidf = models.TfidfModel.load(TFIDF_MODEL_PATH)
print('Loaded TF-IDF model:', TFIDF_MODEL_PATH)
clf = joblib.load(SVC_MODEL_PATH)
print('Loaded trained SVM model:', SVC_MODEL_PATH)
print('')
print('Enter a news title to classify...')
print('')
try:
    for line in sys.stdin:
        title = line.rstrip('\r\n')
        documents = [tokenize(title)]
        bow_corpus = [dictionary.doc2bow(doc) for doc in documents]
        tfidf_corpus = tfidf[bow_corpus]
        X = [
            matutils.corpus2dense([corpus], num_terms=len(dictionary)).T[0]
            for corpus in tfidf_corpus
        ]
        result = clf.predict(X)[0]
        print('-----')
        print('Input:', title)
        print('Prediction:', CATEGORIES[result])
        print('')
except KeyboardInterrupt:
    print('===== Done =====')
def test_normalize():
    suffixsymbs = {
        'high': '++',
        'medium': '+~',
        'low': '+-',
        'positive': '+',
        'negative': '-'
    }

    suffixsyns = {
        'high': 'high',
        'hi': 'high',
        'bright': 'high',
        'Bright': 'high',
        'bri': 'high',
        'br': 'high',
        '(high)': 'high',
        'medium': 'medium',
        'med': 'medium',
        'intermediate': 'medium',
        'int': 'medium',
        '(medium)': 'medium',
        'low': 'low',
        'lo': 'low',
        'LO': 'low',
        'dim': 'low',
        'di': 'low',
        '(low)': 'low',
        'positive': 'positive',
        'negative': 'negative'
    }

    gate_mappings = {
        'Alexa350': 'http://purl.obolibrary.org/obo/PR_001',
        'Alexa750': 'http://purl.obolibrary.org/obo/PR_002',
        'Annexin': 'http://purl.obolibrary.org/obo/PR_003',
        'B220-_live': 'http://purl.obolibrary.org/obo/PR_004',
        'CCR7': 'http://purl.obolibrary.org/obo/PR_005',
        'CD14': 'http://purl.obolibrary.org/obo/PR_006',
        'CD16': 'http://purl.obolibrary.org/obo/PR_007',
        'CD19': 'http://purl.obolibrary.org/obo/PR_008',
        'CD20': 'http://purl.obolibrary.org/obo/PR_009',
        'CD21': 'http://purl.obolibrary.org/obo/PR_010',
        'CD24': 'http://purl.obolibrary.org/obo/PR_011',
        'CD27': 'http://purl.obolibrary.org/obo/PR_012',
        'CD3': 'http://purl.obolibrary.org/obo/PR_013',
        'CD33': 'http://purl.obolibrary.org/obo/PR_014',
        'CD38': 'http://purl.obolibrary.org/obo/PR_015',
        'CD4': 'http://purl.obolibrary.org/obo/PR_016',
        'CD44': 'http://purl.obolibrary.org/obo/PR_017',
        'CD45RA': 'http://purl.obolibrary.org/obo/PR_018',
        'CD4_T_cells': 'http://purl.obolibrary.org/obo/PR_019',
        'CD56': 'http://purl.obolibrary.org/obo/PR_020',
        'CD69': 'http://purl.obolibrary.org/obo/PR_021',
        'CD8': 'http://purl.obolibrary.org/obo/PR_022',
        'CD94': 'http://purl.obolibrary.org/obo/PR_023',
        'CXCR5': 'http://purl.obolibrary.org/obo/PR_024',
        'doublet_excluded': 'http://purl.obolibrary.org/obo/PR_025',
        'ICOS': 'http://purl.obolibrary.org/obo/PR_026',
        'IFNg': 'http://purl.obolibrary.org/obo/PR_027',
        'IL2': 'http://purl.obolibrary.org/obo/PR_028',
        'live': 'http://purl.obolibrary.org/obo/PR_029',
        'Live_cells': 'http://purl.obolibrary.org/obo/PR_030',
        'Lymph': 'http://purl.obolibrary.org/obo/PR_031',
        'Lymphocytes': 'http://purl.obolibrary.org/obo/PR_032',
        'lymphocytes': 'http://purl.obolibrary.org/obo/PR_033',
        'Michael': 'http://purl.obolibrary.org/obo/PR_034',
        'NP_tet': 'http://purl.obolibrary.org/obo/PR_035',
        'PD1': 'http://purl.obolibrary.org/obo/PR_036',
        'Robert': 'http://purl.obolibrary.org/obo/PR_037',
        'singlets': 'http://purl.obolibrary.org/obo/PR_038',
        'small_lymphocyte': 'http://purl.obolibrary.org/obo/PR_039',
        'SSC': 'http://purl.obolibrary.org/obo/PR_040',
        'TNFa': 'http://purl.obolibrary.org/obo/PR_041',
        'Uninfected': 'http://purl.obolibrary.org/obo/PR_042',
        'viable': 'http://purl.obolibrary.org/obo/PR_043',
    }

    special_gates = {
        'Michael': {
            'Ontology ID': 'PR:034',
            'Synonyms': 'mike, mickey, mick',
            'Toxic Synonym': 'mikey'
        },
        'Robert': {
            'Ontology ID': 'PR:037',
            'Synonyms': 'rob, bob, bert',
            'Toxic Synonym': 'bobert'
        }
    }

    preferred = {
        'http://purl.obolibrary.org/obo/PR_001': 'Axexa350',
        'http://purl.obolibrary.org/obo/PR_002': 'Alexa750',
        'http://purl.obolibrary.org/obo/PR_003': 'Annexin',
        'http://purl.obolibrary.org/obo/PR_004': 'B220-_live',
        'http://purl.obolibrary.org/obo/PR_005': 'CCR7',
        'http://purl.obolibrary.org/obo/PR_006': 'CD14',
        'http://purl.obolibrary.org/obo/PR_007': 'CD16',
        'http://purl.obolibrary.org/obo/PR_008': 'CD19',
        'http://purl.obolibrary.org/obo/PR_009': 'CD20',
        'http://purl.obolibrary.org/obo/PR_010': 'CD21',
        'http://purl.obolibrary.org/obo/PR_011': 'CD24',
        'http://purl.obolibrary.org/obo/PR_012': 'CD27',
        'http://purl.obolibrary.org/obo/PR_013': 'CD3',
        'http://purl.obolibrary.org/obo/PR_014': 'CD33',
        'http://purl.obolibrary.org/obo/PR_015': 'CD38',
        'http://purl.obolibrary.org/obo/PR_016': 'CD4',
        'http://purl.obolibrary.org/obo/PR_017': 'CD44',
        'http://purl.obolibrary.org/obo/PR_018': 'CD45RA',
        'http://purl.obolibrary.org/obo/PR_019': 'CD4_T_cells',
        'http://purl.obolibrary.org/obo/PR_020': 'CD56',
        'http://purl.obolibrary.org/obo/PR_021': 'CD69',
        'http://purl.obolibrary.org/obo/PR_022': 'CD8',
        'http://purl.obolibrary.org/obo/PR_023': 'CD94',
        'http://purl.obolibrary.org/obo/PR_024': 'CXCR5',
        'http://purl.obolibrary.org/obo/PR_025': 'doublet_excluded',
        'http://purl.obolibrary.org/obo/PR_026': 'ICOS',
        'http://purl.obolibrary.org/obo/PR_027': 'IFNg',
        'http://purl.obolibrary.org/obo/PR_028': 'IL2',
        'http://purl.obolibrary.org/obo/PR_029': 'live',
        'http://purl.obolibrary.org/obo/PR_030': 'Live_cells',
        'http://purl.obolibrary.org/obo/PR_031': 'Lymph',
        'http://purl.obolibrary.org/obo/PR_032': 'Lymphocytes',
        'http://purl.obolibrary.org/obo/PR_033': 'lymphocytes',
        'http://purl.obolibrary.org/obo/PR_035': 'NP_tet',
        'http://purl.obolibrary.org/obo/PR_036': 'PD1',
        'http://purl.obolibrary.org/obo/PR_038': 'singlets',
        'http://purl.obolibrary.org/obo/PR_039': 'small_lymphocyte',
        'http://purl.obolibrary.org/obo/PR_040': 'SSC',
        'http://purl.obolibrary.org/obo/PR_041': 'TNFa',
        'http://purl.obolibrary.org/obo/PR_042': 'Uninfected',
    }

    reported = 'CD14-CD56-CD3+CD4+CD8-CD45RA+CCR7+'
    tokenized = tokenize('LaJolla', suffixsymbs, suffixsyns, reported)
    assert tokenized == [
        'CD14-', 'CD56-', 'CD3+', 'CD4+', 'CD8-', 'CD45RA+', 'CCR7+'
    ]
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == [
        'PR:006-', 'PR:020-', 'PR:013+', 'PR:016+', 'PR:022-', 'PR:018+',
        'PR:005+'
    ]
    assert preferized == [
        'CD14-', 'CD56-', 'CD3+', 'CD4+', 'CD8-', 'CD45RA+', 'CCR7+'
    ]

    reported = 'CD3-, CD19+, CD20-, CD27hi, CD38hi'
    tokenized = tokenize('Emory', suffixsymbs, suffixsyns, reported)
    assert tokenized == ['CD3-', 'CD19+', 'CD20-', 'CD27++', 'CD38++']
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == [
        'PR:013-', 'PR:008+', 'PR:009-', 'PR:012++', 'PR:015++'
    ]
    assert preferized == ['CD3-', 'CD19+', 'CD20-', 'CD27++', 'CD38++']

    reported = 'CD3-/CD19+/CD20lo/CD38hi/CD27hi'
    tokenized = tokenize('IPIRC', suffixsymbs, suffixsyns, reported)
    assert tokenized == ['CD3-', 'CD19+', 'CD20+-', 'CD38++', 'CD27++']
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == [
        'PR:013-', 'PR:008+', 'PR:009+-', 'PR:015++', 'PR:012++'
    ]
    assert preferized == ['CD3-', 'CD19+', 'CD20+-', 'CD38++', 'CD27++']

    reported = 'CD21hi/CD24int'
    tokenized = tokenize('Watson', suffixsymbs, suffixsyns, reported)
    assert tokenized == ['CD21++', 'CD24+~']
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == ['PR:010++', 'PR:011+~']
    assert preferized == ['CD21++', 'CD24+~']

    reported = 'Annexin negative'
    tokenized = tokenize('Ltest', suffixsymbs, suffixsyns, reported)
    assert tokenized == ['Annexin-']
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == ['PR:003-']
    assert preferized == ['Annexin-']

    reported = 'CD3+ AND CD4+ AND small lymphocyte'
    tokenized = tokenize('VRC', suffixsymbs, suffixsyns, reported)
    assert tokenized == ['CD3+', 'CD4+', 'small_lymphocyte']
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == ['PR:013+', 'PR:016+', 'PR:039']
    assert preferized == ['CD3+', 'CD4+', 'small_lymphocyte']

    reported = 'Lymphocytes and CD8+ and NP tet+'
    tokenized = tokenize('Ertl', suffixsymbs, suffixsyns, reported)
    assert tokenized == ['Lymphocytes', 'CD8+', 'NP_tet+']
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == ['PR:032', 'PR:022+', 'PR:035+']
    assert preferized == ['Lymphocytes', 'CD8+', 'NP_tet+']

    reported = 'Activated T: viable/singlets/Lymph/CD3+'
    tokenized = tokenize('Stanford', suffixsymbs, suffixsyns, reported)
    assert tokenized == ['viable', 'singlets', 'Lymph', 'CD3+']
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == ['PR:043', 'PR:038', 'PR:031', 'PR:013+']
    assert preferized == ['!viable', 'singlets', 'Lymph', 'CD3+']  # TODO: Is this right?

    reported = 'CD14-CD33-/CD3-/CD16+CD56+/CD94+'
    tokenized = tokenize('Stanford', suffixsymbs, suffixsyns, reported)
    assert tokenized == ['CD14-', 'CD33-', 'CD3-', 'CD16+', 'CD56+', 'CD94+']
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == [
        'PR:006-', 'PR:014-', 'PR:013-', 'PR:007+', 'PR:020+', 'PR:023+'
    ]
    assert preferized == ['CD14-', 'CD33-', 'CD3-', 'CD16+', 'CD56+', 'CD94+']  # TODO: Is this right?

    reported = 'Live cells/CD4 T cells/CD4+ CD45RA-/Uninfected/SSC low'
    tokenized = tokenize('Mayo', suffixsymbs, suffixsyns, reported)
    assert tokenized == [
        'Live_cells', 'CD4_T_cells', 'CD4+', 'CD45RA-', 'Uninfected', 'SSC+-'
    ]
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == [
        'PR:030', 'PR:019', 'PR:016+', 'PR:018-', 'PR:042', 'PR:040+-'
    ]
    assert preferized == [
        'Live_cells', 'CD4_T_cells', 'CD4+', 'CD45RA-', 'Uninfected', 'SSC+-'
    ]

    reported = 'B220- live,doublet excluded,CD4+ CD44highCXCR5highPD1high,ICOS+'
    tokenized = tokenize('New York Influenza', suffixsymbs, suffixsyns, reported)
    assert tokenized == [
        'B220-_live', 'doublet_excluded', 'CD4+', 'CD44++', 'CXCR5++',
        'PD1++', 'ICOS+'
    ]
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == [
        'PR:004', 'PR:025', 'PR:016+', 'PR:017++', 'PR:024++', 'PR:036++',
        'PR:026+'
    ]
    assert preferized == [
        'B220-_live', 'doublet_excluded', 'CD4+', 'CD44++', 'CXCR5++',
        'PD1++', 'ICOS+'
    ]

    reported = 'lymphocytes/singlets/live/CD19-CD14-/CD3+/CD8+/CD69+IFNg+IL2+TNFa+'
    tokenized = tokenize('New York Influenza', suffixsymbs, suffixsyns, reported)
    assert tokenized == [
        'lymphocytes', 'singlets', 'live', 'CD19-', 'CD14-', 'CD3+', 'CD8+',
        'CD69+', 'IFNg+', 'IL2+', 'TNFa+'
    ]
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == [
        'PR:033', 'PR:038', 'PR:029', 'PR:008-', 'PR:006-', 'PR:013+',
        'PR:022+', 'PR:021+', 'PR:027+', 'PR:028+', 'PR:041+'
    ]
    assert preferized == [
        'lymphocytes', 'singlets', 'live', 'CD19-', 'CD14-', 'CD3+', 'CD8+',
        'CD69+', 'IFNg+', 'IL2+', 'TNFa+'
    ]

    reported = 'Alexa350 (high) + Alexa750 (medium)'
    tokenized = tokenize('Modeling Viral', suffixsymbs, suffixsyns, reported)
    assert tokenized == ['Alexa350++', 'Alexa750+~']
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == ['PR:001++', 'PR:002+~']
    assert preferized == ['Axexa350++', 'Alexa750+~']

    reported = 'TNFa+IFNg-'
    tokenized = tokenize('Flow Cytometry Analysis', suffixsymbs, suffixsyns, reported)
    assert tokenized == ['TNFa+', 'IFNg-']
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == ['PR:041+', 'PR:027-']
    assert preferized == ['TNFa+', 'IFNg-']

    reported = 'Mikeyhigh/RobLO/Alexa350 (high)/CD33+ý'
    tokenized = tokenize('Some Project', suffixsymbs, suffixsyns, reported)
    assert tokenized == ['Mikey++', 'Rob+-', 'Alexa350++', 'CD33+-']
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == ['PR:034++', 'PR:037+-', 'PR:001++', 'PR:014+-']
    assert preferized == ['Michael++', 'Robert+-', 'Axexa350++', 'CD33+-']
def train(self, corpus_filename: str):
    """Calculates emission and transition model.

    t means tag, w means word.
    Emission:   P(w[i] | t[i])   = C(t[i], w[i])   / C(t[i])
    Transition: P(t[i] | t[i-1]) = C(t[i-1], t[i]) / C(t[i-1])
    where C(t) counts the occurrences of t.
    """
    model = {}
    pos_words = {}
    p = Path(corpus_filename)
    with open(corpus_filename) as f:
        parenstack = []
        bigrams = {}
        bigrams[self.END_STR] = {}
        word_counts = {}
        for line in f:
            tokens = common.tokenize(line)
            tokens_in_node = []
            for token in tokens:
                if token == '(':
                    parenstack.append('(')
                    tokens_in_node = []
                elif token == 'S':
                    prev = self.START_STR
                elif token == ')':
                    parenstack.pop()
                    if not parenstack:
                        if not prev in bigrams[self.END_STR]:
                            bigrams[self.END_STR][prev] = 1
                        else:
                            bigrams[self.END_STR][prev] += 1
                    elif len(tokens_in_node) == 2:
                        pos, word = tokens_in_node
                        if pos != "-NONE-":
                            if not pos in bigrams:
                                bigrams[pos] = {}
                            if not prev in bigrams[pos]:
                                bigrams[pos][prev] = 1
                            else:
                                bigrams[pos][prev] += 1
                            prev = pos
                            if not pos in pos_words:
                                pos_words[pos] = {}
                                pos_words[pos][word] = 1
                            else:
                                if not word in pos_words[pos]:
                                    pos_words[pos][word] = 1
                                else:
                                    pos_words[pos][word] += 1
                        tokens_in_node = []
                        if not word in word_counts:
                            word_counts[word] = 1
                        else:
                            word_counts[word] += 1
                else:
                    tokens_in_node.append(token)

    # normalize transition model
    for pos, d in bigrams.items():
        total = sum(d.values())
        for prev, count in d.items():
            bigrams[pos][prev] = count / total

    # rare words are counted as a single word
    pos_words2 = copy.deepcopy(pos_words)
    for pos, d in pos_words.items():
        for word, count in d.items():
            if count <= self.UNKNOWN_TRESHOLD:
                pos_words2[pos].pop(word)
                if not self.UNKNOWN_STR in pos_words2[pos]:
                    pos_words2[pos][self.UNKNOWN_STR] = 1
                else:
                    pos_words2[pos][self.UNKNOWN_STR] += 1
    pos_words = pos_words2

    # normalize emission model
    for pos, d in pos_words.items():
        total = sum(d.values())
        for word, count in d.items():
            pos_words[pos][word] = count / total

    model["bigrams"] = bigrams
    model["pos_words"] = pos_words
    p = Path(self.PICKLE_FILE)
    with p.open('wb') as output_file:
        pickle.dump(model, output_file, pickle.HIGHEST_PROTOCOL)
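# Worked toy example of the docstring formulas above (counts are invented, for
# illustration only):
c_nn, c_nn_dog = 3, 2   # C(NN), C(NN, "dog")
c_dt, c_dt_nn = 4, 3    # C(DT), C(DT, NN)
emission_dog_given_nn = c_nn_dog / c_nn    # P("dog" | NN) = 2/3
transition_nn_given_dt = c_dt_nn / c_dt    # P(NN | DT)    = 3/4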
def fit(self, documents):
    documents = [tokenize(d) for d in documents]
    documents = [d[:self.max_page_size] for d in documents]
    documents = [' '.join(d) for d in documents]

    if self.encoding_type in ['tfidf', 'count', 'binary']:
        if self.encoding_type == 'tfidf':
            # Assumption: the 'tfidf' mode is meant to use sklearn's TfidfVectorizer
            # (sklearn.feature_extraction.text) rather than raw counts.
            self.vectorizer = TfidfVectorizer(
                ngram_range=(self.min_n_gram, self.max_n_gram),
                max_features=self.max_vocab_size,
                max_df=self.max_df,
                analyzer=self.tokenizer_level)
            self.vectorizer.fit(documents)
        if self.encoding_type == 'count':
            self.vectorizer = CountVectorizer(
                ngram_range=(self.min_n_gram, self.max_n_gram),
                max_features=self.max_vocab_size,
                binary=False,
                max_df=self.max_df,
                analyzer=self.tokenizer_level)
            self.vectorizer.fit(documents)
        if self.encoding_type == 'binary':
            # binary=True so the vectorizer emits 0/1 presence indicators.
            self.vectorizer = CountVectorizer(
                ngram_range=(self.min_n_gram, self.max_n_gram),
                max_features=self.max_vocab_size,
                binary=True,
                max_df=self.max_df,
                analyzer=self.tokenizer_level)
            self.vectorizer.fit(documents)
        with open(self.save_file_loc, 'wb') as f:
            pickle.dump(self.vectorizer, f)

    if self.encoding_type == 'lda':
        documents_tokenized = [tokenize(i) for i in documents]
        self.common_dictionary = Dictionary(documents_tokenized)
        common_corpus = [
            self.common_dictionary.doc2bow(text)
            for text in documents_tokenized
        ]
        self.vectorizer = ldamodel.LdaModel(common_corpus,
                                            id2word=self.common_dictionary,
                                            num_topics=self.num_of_topics,
                                            passes=self.vectorizer_epochs)
        self.vectorizer.save(self.save_file_loc)

    if self.encoding_type == 'doc2vec':
        tagged_documents = [
            TaggedDocument(tokenize(doc), [i])
            for i, doc in enumerate(documents)
        ]
        self.vectorizer = Doc2Vec(tagged_documents,
                                  vector_size=self.encoding_size,
                                  window=2,
                                  min_count=1,
                                  workers=4,
                                  epochs=self.vectorizer_epochs,
                                  max_vocab_size=100000)
        self.vectorizer.delete_temporary_training_data(
            keep_doctags_vectors=True, keep_inference=True)
        self.vectorizer.save(self.save_file_loc)

    if self.encoding_type == 'fasttext':
        with open(self.fasttext_training_file_location, 'w') as f:
            for i in documents:
                f.write(clean_text(i) + '\n')
        self.vectorizer = fasttext.train_unsupervised(
            self.fasttext_training_file_location,
            model=self.fasttext_algorithm,
            dim=self.encoding_size)
        self.vectorizer.save_model(self.save_file_loc)
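# Hedged usage sketch (not from the original source). `PageEncoder` is a hypothetical
# name for the class that owns the fit/transform methods above; its constructor
# arguments mirror the attributes those methods reference (encoding_type,
# max_page_size, min_n_gram, ...).
#
#   encoder = PageEncoder(encoding_type='tfidf', max_page_size=500)
#   encoder.fit(training_pages)                 # fits and persists the vectorizer
#   features = encoder.transform(new_pages)     # numpy array, one row per document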
parser = ArgumentParser()
parser.add_argument('-d', '--data', default='data')
args = parser.parse_args()

in_dir = join(args.data, 'raw/blogs/')
in_paths = glob(in_dir + "*xml")
out_dir = join(args.data, 'proc/blogs/')

x = []
y = []
for path in in_paths:
    if 'male' not in path:  # keep only files whose name contains 'male' or 'female'
        continue
    xml = read_xml(path)
    posts = xml.xpath(".//post/text()")
    is_female = 'female' in path
    for post in posts:
        encoding = tokenize(post)
        if encoding.shape[0] <= min_sequence_length:
            continue
        if encoding.shape[0] < sequence_length:
            encoding = pad(encoding, sequence_length)
        label = 1 if is_female else 0
        x.append(encoding[:sequence_length])
        y.append(label)

makedirs(out_dir, exist_ok=True)
np.save(join(out_dir, 'x.npy'), np.vstack(x))
np.save(join(out_dir, 'y.npy'), np.array(y))
from gensim import corpora, models, matutils
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.externals import joblib
from sklearn.svm import SVC
from common import CATEGORIES, DICT_PATH, TFIDF_MODEL_PATH, SVC_MODEL_PATH, tokenize, load_documents

titles, labels = load_documents()
print('Loaded training data: titles=%s labels=%s' % (len(titles), len(labels)))
print('')

documents = [tokenize(title) for title in titles]
print('Tokenized with morphological analysis: documents=%s' % (len(documents)))
print('')

dictionary = corpora.Dictionary(documents)
dictionary.filter_extremes(no_above=0.7)
print('Built dictionary: unique tokens=%s' % (len(dictionary)))
print('')

dictionary.save(DICT_PATH)
print('Saved dictionary:', DICT_PATH)
print('')

bow_corpus = [dictionary.doc2bow(doc) for doc in documents]
print('Built BOW corpus:', bow_corpus[0])
print('')

tfidf = models.TfidfModel(bow_corpus)
print('Built TF-IDF model')
print('')
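# Self-contained mini example of the gensim pipeline used above (toy data; the real
# script goes through tokenize() and the saved DICT_PATH / TFIDF_MODEL_PATH models).
toy_docs = [['stocks', 'rise', 'today'], ['team', 'wins', 'final'], ['stocks', 'fall']]
toy_dict = corpora.Dictionary(toy_docs)
toy_bow = [toy_dict.doc2bow(doc) for doc in toy_docs]
toy_tfidf = models.TfidfModel(toy_bow)
dense = matutils.corpus2dense([toy_tfidf[toy_bow[0]]], num_terms=len(toy_dict)).T[0]
# `dense` is the kind of fixed-length feature vector that gets fed to the SVC classifier.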
def tokenizer(text):
    tokens = tokenize(text)
    tokens = stemwords(tokens)
    return tokens
    ['\nFrom:', '\nTo:', '- Original Message -', '-----------']
]
reply_indices = [index for index in reply_indices if index > -1]
if len(reply_indices) > 0:
    # remove older messages, signatures, etc.
    text = text[:min(reply_indices) - 5]
lines = text.split('\n')
# remove quoted lines
non_quote_lines = [line for line in lines if not line.startswith('>')]
text = '\n'.join(non_quote_lines)
for name in names:
    # remove own name as an indicator of gender (e.g. in signature)
    if len(name) > 2:
        text = sub(name, '', text, flags=IGNORECASE)
# replace question marks, as they are also used for unknown characters and redacted text
text = text.replace('?', '')
encoding = tokenize(text)
if encoding.shape[0] <= min_sequence_length:
    continue
if encoding.shape[0] < sequence_length:
    encoding = pad(encoding, sequence_length)
x.append(encoding[:sequence_length])
label = 1 if 'female' in gender else 0
y.append(label)
checksums.append(checksum)

x = np.vstack(x)
y = np.array(y)
makedirs(out_dir, exist_ok=True)
np.save(join(out_dir, 'x'), x)
np.save(join(out_dir, 'y'), y)
def predict(text):
    x = tokenize(text)
    with graph.as_default():
        probability = model.predict(np.array([x]))[0][0]
    return probability
# a dictionary of the form { term: { doc_id: freq } }
term_dictionary = defaultdict(Counter)
all_doc_ids = sorted(map(int, os.listdir(input_directory)))
doc_size = dict()
for doc_id in all_doc_ids:
    print("Trying to index doc %s..." % doc_id)
    filepath = os.path.join(input_directory, str(doc_id))
    with open(filepath) as input_file:
        document_content = input_file.read()
    unique_terms = 0
    for term in tokenize(document_content):
        if doc_id not in term_dictionary[term]:
            unique_terms += 1
        term_dictionary[term][doc_id] += 1
    doc_size[doc_id] = unique_terms


# Formats the posting list for a specific term
# - input: a posting of the form { doc_id: freq, doc_id: freq }
# - return: a formatted posting string with "doc_id:freq doc_id:freq"
def format_posting_list(posting):
    sorted_doc_ids = sorted(posting)
    posting_strings = []
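# The body of format_posting_list is truncated above; a minimal sketch of what its
# docstring describes ("doc_id:freq doc_id:freq", doc ids sorted) could look like the
# following hypothetical helper (not the original implementation):
def format_posting_list_sketch(posting):
    return ' '.join('%s:%s' % (doc_id, posting[doc_id]) for doc_id in sorted(posting))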
def extract_grammar(filename: str, args):
    with open(filename, "r") as corpus_file:
        pos_stack = []
        paren_stack = []
        words_pos = defaultdict(dd)
        prev = None
        root = None
        rules = []
        terminals = set()
        for line in corpus_file:
            tokens = common.tokenize(line)
            for token in tokens:
                if token == "(":
                    paren_stack.append("(")
                elif token == ")":
                    while True:
                        if not paren_stack:
                            return "bad"
                        else:
                            el = paren_stack.pop()
                            if pos_stack:
                                pos_stack.pop()
                            if el == "(":
                                break
                    if not paren_stack:
                        save_rule(root, rules)
                        pos_stack = []
                        paren_stack = []
                        prev = None
                        root = None
                else:
                    if prev == "(":
                        node = Node(token)
                        if pos_stack:
                            pos_stack[-1].children.append(node)
                        else:
                            root = node
                        pos_stack.append(node)
                    elif prev != "-NONE-":
                        # previous token was a POS, so this token can only be a terminal
                        terminals.add(prev)
                        words_pos[token][prev] += 1
                prev = token

    # probabilities from frequencies
    for word, d in words_pos.items():
        total = sum(d.values())
        for pos, count in d.items():
            words_pos[word][pos] = count / total

    MOST_COMMON_COUNT = 1000
    sorted_rules = Counter(rules).most_common()
    for rule in sorted_rules[:MOST_COMMON_COUNT]:
        same_lhs = filter(lambda r: r[0].lhs == rule[0].lhs, sorted_rules)
        total = sum([r[1] for r in same_lhs])
        rule[0].prob = rule[1] / total
    grammar_rules = [rule[0] for rule in sorted_rules[:MOST_COMMON_COUNT]]
    grammar = Grammar(grammar_rules, terminals, words_pos)

    if args.mode == "pre":
        print("{:>5} | {}".format("count", "rule"))
        for rule in sorted_rules[:MOST_COMMON_COUNT]:
            print("{:>5} | {}".format(rule[1], rule[0]))
        sum_first = sum([rule[1] for rule in sorted_rules[:MOST_COMMON_COUNT]])
        sum_rest = sum([rule[1] for rule in sorted_rules[MOST_COMMON_COUNT:]])
        print("{} rules total:".format(len(rules)))
        print("{:>8} (first {})".format(sum_first, MOST_COMMON_COUNT))
        print("{:>8} (rest)".format(sum_rest))
        print("{:.3f}% coverage".format(100.0 * sum_first / len(rules)))
        assert len(rules) == sum_first + sum_rest
    else:
        for rule in grammar.rules:
            print(rule)

    p = Path(PICKLE_FILE)
    with p.open('wb') as output_file:
        pickle.dump(grammar, output_file, pickle.HIGHEST_PROTOCOL)
    return grammar
def main():
    # Define command-line parameters
    parser = argparse.ArgumentParser(
        description='Normalize cell population descriptions')
    parser.add_argument(
        'excluded', type=argparse.FileType('r'),
        help='a TSV file with experiment accessions to be ignored')
    parser.add_argument(
        'scale', type=argparse.FileType('r'),
        help='a TSV file with the value scale (e.g. high, low, negative)')
    parser.add_argument(
        'mappings', type=argparse.FileType('r'),
        help='a TSV file which maps gate labels to ontology ids/keywords')
    parser.add_argument(
        'special', type=argparse.FileType('r'),
        help='a TSV file containing extra information about a subset of gates')
    parser.add_argument(
        'preferred', type=argparse.FileType('r'),
        help='a TSV file which maps ontology ids to preferred labels')
    parser.add_argument('cells', type=argparse.FileType('r'),
                        help='an OWL file for the Cell Ontology')
    parser.add_argument('source', type=argparse.FileType('r'),
                        help='the source data TSV file')
    parser.add_argument('output', type=str, help='the output TSV file')

    # Parse command-line parameters
    args = parser.parse_args()

    # Load the contents of the file given by the command-line parameter args.excluded.
    # These are the experiments we should ignore when reading from the source file.
    excluded_experiments = set()
    rows = csv.DictReader(args.excluded, delimiter='\t')
    for row in rows:
        excluded_experiments.add(row['Experiment Accession'])

    # Load the contents of the file given by the command-line parameter args.scale.
    # This defines the suffix synonyms and symbols for various scaling indicators,
    # which must be noted during parsing.
    rows = csv.DictReader(args.scale, delimiter='\t')
    suffixsymbs, suffixsyns = extract_suffix_syns_symbs_maps(rows)

    # Load the contents of the file given by the command-line parameter args.mappings.
    # This file associates gate labels with the ontology ids / keywords with which we
    # populate the 'Gating mapped to ontologies' column of the output file.
    rows = csv.DictReader(args.mappings, delimiter='\t')
    gate_mappings = {}
    for row in rows:
        gate_mappings[row['Label']] = row['Ontology ID']

    # Load the contents of the file given by the command-line parameter args.special.
    # This file (similarly to the args.mappings file) associates certain gate labels
    # with ontology ids, but also contains additional information regarding these gates.
    rows = csv.DictReader(args.special, delimiter='\t')
    special_gates = {}
    for row in rows:
        special_gates[row['Label']] = {
            'Ontology ID': row['Ontology ID'],
            'Synonyms': row['Synonyms'],
            'Toxic Synonym': row['toxic synonym']
        }

    # Load the contents of the file given by the command-line parameter args.preferred.
    # This file associates ontology ids with preferred gate labels (i.e. pr#PRO-short-label).
    rows = csv.DictReader(args.preferred, delimiter='\t')
    preferred = {}
    for row in rows:
        preferred[row['Ontology ID']] = row['Preferred Label']

    # Load the contents of the file given by args.cells. This is an OWL file in XML format.
    # We first parse it using python's xml library, and then call update_iri_maps_from_owl
    # to retrieve the maps: synonym_iris, iri_labels, iri_gates, and iri_parents.
    tree = ET.parse(args.cells)
    iri_gates, iri_parents, iri_labels, synonym_iris = update_iri_maps_from_owl(tree)

    # Finally, load the contents of the source file, process each row, and write the
    # processed row to a new file.
    rows = csv.DictReader(args.source, delimiter='\t')
    with open(args.output, 'w') as output:
        w = csv.writer(output, delimiter='\t', lineterminator='\n')
        # Write the header row:
        output_fieldnames = [
            'NAME', 'STUDY_ACCESSION', 'EXPERIMENT_ACCESSION',
            'POPULATION_NAME_REPORTED', 'CL term', 'CL ID', 'CL definition',
            'extra', 'POPULATION_DEFNITION_REPORTED',
            'Population preferred name', 'Gating tokenized',
            'Gating mapped to ontologies', 'Gating preferred definition',
            'Conflicts', 'Conflict type'
        ]
        w.writerow(output_fieldnames)

        conflict_count = 0
        symbols = suffixsymbs.values()
        for row in rows:
            # Ignore any rows describing excluded experiments.
            if row['EXPERIMENT_ACCESSION'] in excluded_experiments:
                continue

            # Tokenize and normalize the population name:
            extra = row['extra'].strip()
            tokenized_gates = tokenize('Standard', suffixsymbs, suffixsyns, extra)
            preferized_gates, ontologized_gates = normalize(
                tokenized_gates, gate_mappings, special_gates, preferred, symbols)

            # Determine the population preferred name:
            preferred_name = row['CL term'] or ''
            if preferred_name and preferized_gates:
                preferred_name += ' & ' + ', '.join(preferized_gates)
            row['Population preferred name'] = preferred_name

            # Determine the CL definition:
            population_gates = []
            cell_type = re.sub('^CL:', 'http://purl.obolibrary.org/obo/CL_',
                               row['CL ID'])
            if cell_type and cell_type in iri_gates:
                for gate in iri_gates[cell_type]:
                    preferred_label = preferred.get(gate['kind'])
                    if preferred_label:
                        population_gates.append(
                            preferred_label + get_iri_levels()[gate['level']])
            row['CL definition'] = ', '.join(population_gates)

            # These will be needed later for determining conflicts:
            extra_gates = preferized_gates.copy()
            cell_gates = population_gates + preferized_gates

            # Tokenize and normalize the reported population definition, first removing
            # any surrounding quotation marks:
            reported = row['POPULATION_DEFNITION_REPORTED'].strip('"').strip("'")
            tokenized_gates = tokenize(row['NAME'], suffixsymbs, suffixsyns, reported)
            row['Gating tokenized'] = ', '.join(tokenized_gates)
            preferized_gates, ontologized_gates = normalize(
                tokenized_gates, gate_mappings, special_gates, preferred, symbols)
            row['Gating mapped to ontologies'] = ', '.join(ontologized_gates)
            row['Gating preferred definition'] = ', '.join(preferized_gates)

            # Determine the conflicts:
            conflict_type = ''
            conflicts = []
            for population_gate in cell_gates:
                for definition_gate in preferized_gates:
                    pgate, plevel = split_gate(population_gate, symbols)
                    dgate, dlevel = split_gate(definition_gate, symbols)
                    ppos = plevel != '-'
                    dpos = dlevel != '-'
                    if pgate == dgate and ppos != dpos:
                        conflicts.append(population_gate + '/' + dlevel)
                        if population_gate in extra_gates:
                            conflict_type = 'conflict with extra'
                        else:
                            conflict_type = 'conflict with CL definition'
            if len(conflicts) > 0:
                print(conflicts)
                conflict_count += 1
            row['Conflicts'] = ', '.join(conflicts)
            row['Conflict type'] = conflict_type

            # Explicitly reference output_fieldnames here to make sure that the order in
            # which the data is written to the file matches the header order.
            w.writerow([row[fn] for fn in output_fieldnames])

    print('Conflicts:', conflict_count)
def parse_query(query):
    """Parses a query into a { term: freq } mapping."""
    return Counter(tokenize(query))
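# Usage sketch (illustrative): with a plain whitespace tokenize(), repeated terms simply
# accumulate in the Counter, e.g.
#   parse_query("new york new jersey")  ->  Counter({'new': 2, 'york': 1, 'jersey': 1})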