Example #1
def getVideos(root):
    '''Return all videos in the root directory and all of its subdirectories,
    or the root itself if it is a video file.'''
    videos = []
    if path.isfile(root):
        dirpath, nameWithExt = path.split(root)
        name, ext = path.splitext(nameWithExt)
        if ext[1:] in common.videoTypes:
            videos.append(
                    common.videoToDict(dirpath, name, common.tokenize(name)))
        else:
            raise Exception('File is not a known video format')
    else:
        for (dirpath, dirname, filenames) in os.walk(root):
            # get all files with extension from videoTypes, leading dot is 
            # stripped from the extension
            for f in filenames:
                name, ext = path.splitext(f)
                if ext[1:] in common.videoTypes:
                    videos.append(common.videoToDict(
                            dirpath, name, common.tokenize(name)))
    if len(videos) == 0:
        raise Exception(
                'No file with known video format found in the directory')
    return videos
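
# A minimal usage sketch (hypothetical paths; assumes common.videoTypes holds
# bare extensions such as 'mp4' or 'mkv', as the extension check above implies):
#   getVideos('/data/movies')           # walk a directory tree for video files
#   getVideos('/data/movies/clip.mp4')  # or pass a single video file directly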
Example #2
    def transform(self, documents):

        documents = [tokenize(d) for d in documents]
        documents = [d[:self.max_page_size] for d in documents]
        documents = [' '.join(d) for d in documents]

        if self.encoding_type in ['tfidf', 'count', 'binary']:
            return self.vectorizer.transform(documents).toarray()
        if self.encoding_type == 'lda':
            documents_tokenized = [tokenize(i) for i in documents]
            other_corpus = [
                self.common_dictionary.doc2bow(i) for i in documents_tokenized
            ]
            results = []
            for i in other_corpus:
                result = self.vectorizer[i]
                result = vectorize_topic_models(result, self.num_of_topics)
                results.append(result)

            return np.array(results)
        if self.encoding_type in ['doc2vec']:
            documents_tokenized = [tokenize(i) for i in documents]

            results = []
            for i in documents_tokenized:
                if i:
                    try:
                        results.append(self.vectorizer[i][0])
                    except KeyError:
                        results.append([0 for _ in range(self.encoding_size)])
                else:
                    results.append([0 for _ in range(self.encoding_size)])

            return np.array(results)

        if self.encoding_type in ['fasttext']:
            documents_clean = [clean_text(i) for i in documents]

            results = []
            for i in documents_clean:
                if i:
                    results.append(self.vectorizer.get_sentence_vector(i))
                    # results.append(self.vectorizer[i])
                else:
                    results.append(
                        np.array([0 for _ in range(self.encoding_size)]))

            return np.array(results)
Example #3
    def convert_format(self):
        sentences = []
        words = pkl.load(open("LM_corpura//%s//%s" % (cfg['lm_corpus'], cfg['corpus__dict_file']), 'rb'))
        word_dict = dict([(word, key) for key, word in enumerate(words, 1)])
        #word_dict = common.get_word_dict(self.conf['index2word_path'])

        if self.name == "pos_tagging":
            tags = set([tag for word, tag in treebank.tagged_words()])
            tag_index = {tag: idx for idx, tag in enumerate(tags, 1)}
            for sentence in treebank.tagged_sents():
                sent_words = [(word_dict[common.tokenize(w)], tag_index[t])
                              if common.tokenize(w) in word_dict else (0, tag_index[t])
                              for w, t in sentence]
                sentences.append(sent_words)

        return sentences
Example #4
    def read_data(self, tagged_sentences):
        features = []
        tags = []

        for sentence in tagged_sentences:
            sent_words = [
                self.word_dict[common.tokenize(w)]
                if common.tokenize(w) in self.word_dict else len(self.words)
                for w, t in sentence
            ]
            sent_tags = [t for w, t in sentence]
            sent_representations = self.lm.predict(np.asarray(sent_words))
            sent_representations = np.squeeze(sent_representations, axis=1)
            features.append(sent_representations)
            tags.append(sent_tags)

        all_tags = set(np.concatenate(tags))

        tags_enum = [(tag, idx) for idx, tag in enumerate(all_tags)]
        tag_dict = dict(tags_enum)

        tags = [[tag_dict[tag] for tag in sent_tags] for sent_tags in tags]

        #features = np.concatenate(features)
        tags = np.asarray(tags)
        tags = keras.utils.to_categorical(tags)

        X_train, X_test, y_train, y_test = train_test_split(features,
                                                            tags,
                                                            test_size=0.2,
                                                            random_state=42)

        for size in [0.01, 0.05, 0.1, 0.2]:
            print("Using %d examples" % (int(X_train.shape[0] * size)))
            svm = SVC(kernel='linear')
            svm.fit(X_train[:int(X_train.shape[0] * size)],
                    y_train[:int(X_train.shape[0] * size)])
            score = svm.score(X_test, y_test)
            print("for %.2f%% of the data: %.2f%% accuracy" %
                  (100 * size, 100 * score))
Example #5
def process_html(r_text, r_time, url, timestamp, file_name):
    if r_text:
        record = generate_link_dict(url)
        soup = BeautifulSoup(r_text, 'lxml')
        new_links = [i['href'] for i in soup.find_all('a', href=True)]
        new_abs_links = [i for i in new_links if is_link_external(i, record['netloc'])]
        record['page_external_links'] = str(new_abs_links)
        record['request_time'] = r_time
        record['request_timestamp'] = timestamp

        meta_data = get_meta_info_from_html(r_text)
        page_text = get_text_from_html(r_text)

        record['html_char_len'] = len(r_text)
        record['text_char_len'] = len(page_text)
        record['meta_char_len'] = len(meta_data)
        record['html_word_len'] = len(tokenize(r_text))
        record['text_word_len'] = len(tokenize(page_text))
        record['meta_word_len'] = len(tokenize(meta_data))

        with open(f'{dir_loc}/all_html_chunks/{file_name}.txt', 'a') as f:
            f.write(f'{url}{sep_char}{str(r_text).replace(sep_char, "")}' + "\n")
        with open(f'{dir_loc}/all_meta_chunks/{file_name}.txt', 'a') as f:
            f.write(f'{url}{sep_char}{str(meta_data).replace(sep_char, "")}' + "\n")
        with open(f'{dir_loc}/all_text_chunks/{file_name}.txt', 'a') as f:
            f.write(f'{url}{sep_char}{str(page_text).replace(sep_char, "")}' + "\n")

        record['file_name'] = str(file_name)
        record_df = pd.DataFrame.from_dict([record])
        record_df = record_df.set_index('url')

        while True:
            try:
                with sqlite3.connect(f'{dir_loc}/dbs/{db_name}') as conn_disk:
                    record_df.to_sql('websites', conn_disk, if_exists='append', index=True)
                break
            except sqlite3.OperationalError:
                time.sleep(5)
                print('db locked')
Example #6
    def train(self, corpus_filename: str):
        word_freq = {}

        p = Path(corpus_filename)
        with open(corpus_filename) as f:
            parenstack = []
            words_pos = []
            line_number = 0  # incremented at the top of the loop, so the first line is 1

            for line in f:
                line_number += 1
                tokens = common.tokenize(line)
                tokens_in_node = []
                for token in tokens:
                    if token == '(':
                        parenstack.append('(')
                        tokens_in_node = []
                    elif token == ')':
                        parenstack.pop()
                        if len(tokens_in_node) == 2:
                            words_pos.append(tuple(tokens_in_node))
                            pos, word = tokens_in_node
                            if not word in word_freq:
                                word_freq[word] = {}
                                word_freq[word][pos] = 1
                            else:
                                if not pos in word_freq[word]:
                                    word_freq[word][pos] = 1
                                else:
                                    word_freq[word][pos] += 1
                            # reset after every processed (pos, word) node
                            tokens_in_node = []
                    else:
                        tokens_in_node.append(token)

            p = Path(self.PICKLE_FILE)
            with p.open('wb') as output_file:
                pickle.dump(word_freq, output_file, pickle.HIGHEST_PROTOCOL)
Example #7
print('辞書を読み込みました:', DICT_PATH)  # "Loaded the dictionary:"

tfidf = models.TfidfModel.load(TFIDF_MODEL_PATH)
print('TF-IDFモデルを読み込みました:', TFIDF_MODEL_PATH)  # "Loaded the TF-IDF model:"

clf = joblib.load(SVC_MODEL_PATH)
print('SVM学習モデルを読み込みました:', SVC_MODEL_PATH)  # "Loaded the trained SVM model:"

print('')
print('予測したいニュースタイトルを入力してください...')  # "Enter a news title to classify..."
print('')

try:
    for line in sys.stdin:
        title = line.rstrip('\r\n')
        documents = [tokenize(title)]
        bow_corpus = [dictionary.doc2bow(doc) for doc in documents]
        tfidf_corpus = tfidf[bow_corpus]
        X = [
            matutils.corpus2dense([corpus], num_terms=len(dictionary)).T[0]
            for corpus in tfidf_corpus
        ]

        result = clf.predict(X)[0]
        print('-----')
        print('入力:', title)  # "Input:"
        print('予測:', CATEGORIES[result])  # "Prediction:"
        print('')
except KeyboardInterrupt:
    print('===== 終了 =====')  # "===== Exiting ====="
Example #8
def test_normalize():
    suffixsymbs = {
        'high': '++',
        'medium': '+~',
        'low': '+-',
        'positive': '+',
        'negative': '-'
    }

    suffixsyns = {
        'high': 'high',
        'hi': 'high',
        'bright': 'high',
        'Bright': 'high',
        'bri': 'high',
        'br': 'high',
        '(high)': 'high',
        'medium': 'medium',
        'med': 'medium',
        'intermediate': 'medium',
        'int': 'medium',
        '(medium)': 'medium',
        'low': 'low',
        'lo': 'low',
        'LO': 'low',
        'dim': 'low',
        'di': 'low',
        '(low)': 'low',
        'positive': 'positive',
        'negative': 'negative'
    }

    gate_mappings = {
        'Alexa350': 'http://purl.obolibrary.org/obo/PR_001',
        'Alexa750': 'http://purl.obolibrary.org/obo/PR_002',
        'Annexin': 'http://purl.obolibrary.org/obo/PR_003',
        'B220-_live': 'http://purl.obolibrary.org/obo/PR_004',
        'CCR7': 'http://purl.obolibrary.org/obo/PR_005',
        'CD14': 'http://purl.obolibrary.org/obo/PR_006',
        'CD16': 'http://purl.obolibrary.org/obo/PR_007',
        'CD19': 'http://purl.obolibrary.org/obo/PR_008',
        'CD20': 'http://purl.obolibrary.org/obo/PR_009',
        'CD21': 'http://purl.obolibrary.org/obo/PR_010',
        'CD24': 'http://purl.obolibrary.org/obo/PR_011',
        'CD27': 'http://purl.obolibrary.org/obo/PR_012',
        'CD3': 'http://purl.obolibrary.org/obo/PR_013',
        'CD33': 'http://purl.obolibrary.org/obo/PR_014',
        'CD38': 'http://purl.obolibrary.org/obo/PR_015',
        'CD4': 'http://purl.obolibrary.org/obo/PR_016',
        'CD44': 'http://purl.obolibrary.org/obo/PR_017',
        'CD45RA': 'http://purl.obolibrary.org/obo/PR_018',
        'CD4_T_cells': 'http://purl.obolibrary.org/obo/PR_019',
        'CD56': 'http://purl.obolibrary.org/obo/PR_020',
        'CD69': 'http://purl.obolibrary.org/obo/PR_021',
        'CD8': 'http://purl.obolibrary.org/obo/PR_022',
        'CD94': 'http://purl.obolibrary.org/obo/PR_023',
        'CXCR5': 'http://purl.obolibrary.org/obo/PR_024',
        'doublet_excluded': 'http://purl.obolibrary.org/obo/PR_025',
        'ICOS': 'http://purl.obolibrary.org/obo/PR_026',
        'IFNg': 'http://purl.obolibrary.org/obo/PR_027',
        'IL2': 'http://purl.obolibrary.org/obo/PR_028',
        'live': 'http://purl.obolibrary.org/obo/PR_029',
        'Live_cells': 'http://purl.obolibrary.org/obo/PR_030',
        'Lymph': 'http://purl.obolibrary.org/obo/PR_031',
        'Lymphocytes': 'http://purl.obolibrary.org/obo/PR_032',
        'lymphocytes': 'http://purl.obolibrary.org/obo/PR_033',
        'Michael': 'http://purl.obolibrary.org/obo/PR_034',
        'NP_tet': 'http://purl.obolibrary.org/obo/PR_035',
        'PD1': 'http://purl.obolibrary.org/obo/PR_036',
        'Robert': 'http://purl.obolibrary.org/obo/PR_037',
        'singlets': 'http://purl.obolibrary.org/obo/PR_038',
        'small_lymphocyte': 'http://purl.obolibrary.org/obo/PR_039',
        'SSC': 'http://purl.obolibrary.org/obo/PR_040',
        'TNFa': 'http://purl.obolibrary.org/obo/PR_041',
        'Uninfected': 'http://purl.obolibrary.org/obo/PR_042',
        'viable': 'http://purl.obolibrary.org/obo/PR_043',
    }

    special_gates = {
        'Michael': {
            'Ontology ID': 'PR:034',
            'Synonyms': 'mike, mickey, mick',
            'Toxic Synonym': 'mikey'
        },
        'Robert': {
            'Ontology ID': 'PR:037',
            'Synonyms': 'rob, bob, bert',
            'Toxic Synonym': 'bobert'
        }
    }

    preferred = {
        'http://purl.obolibrary.org/obo/PR_001': 'Axexa350',
        'http://purl.obolibrary.org/obo/PR_002': 'Alexa750',
        'http://purl.obolibrary.org/obo/PR_003': 'Annexin',
        'http://purl.obolibrary.org/obo/PR_004': 'B220-_live',
        'http://purl.obolibrary.org/obo/PR_005': 'CCR7',
        'http://purl.obolibrary.org/obo/PR_006': 'CD14',
        'http://purl.obolibrary.org/obo/PR_007': 'CD16',
        'http://purl.obolibrary.org/obo/PR_008': 'CD19',
        'http://purl.obolibrary.org/obo/PR_009': 'CD20',
        'http://purl.obolibrary.org/obo/PR_010': 'CD21',
        'http://purl.obolibrary.org/obo/PR_011': 'CD24',
        'http://purl.obolibrary.org/obo/PR_012': 'CD27',
        'http://purl.obolibrary.org/obo/PR_013': 'CD3',
        'http://purl.obolibrary.org/obo/PR_014': 'CD33',
        'http://purl.obolibrary.org/obo/PR_015': 'CD38',
        'http://purl.obolibrary.org/obo/PR_016': 'CD4',
        'http://purl.obolibrary.org/obo/PR_017': 'CD44',
        'http://purl.obolibrary.org/obo/PR_018': 'CD45RA',
        'http://purl.obolibrary.org/obo/PR_019': 'CD4_T_cells',
        'http://purl.obolibrary.org/obo/PR_020': 'CD56',
        'http://purl.obolibrary.org/obo/PR_021': 'CD69',
        'http://purl.obolibrary.org/obo/PR_022': 'CD8',
        'http://purl.obolibrary.org/obo/PR_023': 'CD94',
        'http://purl.obolibrary.org/obo/PR_024': 'CXCR5',
        'http://purl.obolibrary.org/obo/PR_025': 'doublet_excluded',
        'http://purl.obolibrary.org/obo/PR_026': 'ICOS',
        'http://purl.obolibrary.org/obo/PR_027': 'IFNg',
        'http://purl.obolibrary.org/obo/PR_028': 'IL2',
        'http://purl.obolibrary.org/obo/PR_029': 'live',
        'http://purl.obolibrary.org/obo/PR_030': 'Live_cells',
        'http://purl.obolibrary.org/obo/PR_031': 'Lymph',
        'http://purl.obolibrary.org/obo/PR_032': 'Lymphocytes',
        'http://purl.obolibrary.org/obo/PR_033': 'lymphocytes',
        'http://purl.obolibrary.org/obo/PR_035': 'NP_tet',
        'http://purl.obolibrary.org/obo/PR_036': 'PD1',
        'http://purl.obolibrary.org/obo/PR_038': 'singlets',
        'http://purl.obolibrary.org/obo/PR_039': 'small_lymphocyte',
        'http://purl.obolibrary.org/obo/PR_040': 'SSC',
        'http://purl.obolibrary.org/obo/PR_041': 'TNFa',
        'http://purl.obolibrary.org/obo/PR_042': 'Uninfected',
    }

    reported = 'CD14-CD56-CD3+CD4+CD8-CD45RA+CCR7+'
    tokenized = tokenize('LaJolla', suffixsymbs, suffixsyns, reported)
    assert tokenized == [
        'CD14-', 'CD56-', 'CD3+', 'CD4+', 'CD8-', 'CD45RA+', 'CCR7+'
    ]
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == [
        'PR:006-', 'PR:020-', 'PR:013+', 'PR:016+', 'PR:022-', 'PR:018+',
        'PR:005+'
    ]
    assert preferized == [
        'CD14-', 'CD56-', 'CD3+', 'CD4+', 'CD8-', 'CD45RA+', 'CCR7+'
    ]

    reported = 'CD3-, CD19+, CD20-, CD27hi, CD38hi'
    tokenized = tokenize('Emory', suffixsymbs, suffixsyns, reported)
    assert tokenized == ['CD3-', 'CD19+', 'CD20-', 'CD27++', 'CD38++']
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == [
        'PR:013-', 'PR:008+', 'PR:009-', 'PR:012++', 'PR:015++'
    ]
    assert preferized == ['CD3-', 'CD19+', 'CD20-', 'CD27++', 'CD38++']

    reported = 'CD3-/CD19+/CD20lo/CD38hi/CD27hi'
    tokenized = tokenize('IPIRC', suffixsymbs, suffixsyns, reported)
    assert tokenized == ['CD3-', 'CD19+', 'CD20+-', 'CD38++', 'CD27++']
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == [
        'PR:013-', 'PR:008+', 'PR:009+-', 'PR:015++', 'PR:012++'
    ]
    assert preferized == ['CD3-', 'CD19+', 'CD20+-', 'CD38++', 'CD27++']

    reported = 'CD21hi/CD24int'
    tokenized = tokenize('Watson', suffixsymbs, suffixsyns, reported)
    assert tokenized == ['CD21++', 'CD24+~']
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == ['PR:010++', 'PR:011+~']
    assert preferized == ['CD21++', 'CD24+~']

    reported = 'Annexin negative'
    tokenized = tokenize('Ltest', suffixsymbs, suffixsyns, reported)
    assert tokenized == ['Annexin-']
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == ['PR:003-']
    assert preferized == ['Annexin-']

    reported = 'CD3+ AND CD4+ AND small lymphocyte'
    tokenized = tokenize('VRC', suffixsymbs, suffixsyns, reported)
    assert tokenized == ['CD3+', 'CD4+', 'small_lymphocyte']
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == ['PR:013+', 'PR:016+', 'PR:039']
    assert preferized == ['CD3+', 'CD4+', 'small_lymphocyte']

    reported = 'Lymphocytes and CD8+ and NP tet+'
    tokenized = tokenize('Ertl', suffixsymbs, suffixsyns, reported)
    assert tokenized == ['Lymphocytes', 'CD8+', 'NP_tet+']
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == ['PR:032', 'PR:022+', 'PR:035+']
    assert preferized == ['Lymphocytes', 'CD8+', 'NP_tet+']

    reported = 'Activated T: viable/singlets/Lymph/CD3+'
    tokenized = tokenize('Stanford', suffixsymbs, suffixsyns, reported)
    assert tokenized == ['viable', 'singlets', 'Lymph', 'CD3+']
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == ['PR:043', 'PR:038', 'PR:031', 'PR:013+']
    assert preferized == ['!viable', 'singlets', 'Lymph', 'CD3+']

    # TODO: Is this right?
    reported = 'CD14-CD33-/CD3-/CD16+CD56+/CD94+'
    tokenized = tokenize('Stanford', suffixsymbs, suffixsyns, reported)
    assert tokenized == ['CD14-', 'CD33-', 'CD3-', 'CD16+', 'CD56+', 'CD94+']
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == [
        'PR:006-', 'PR:014-', 'PR:013-', 'PR:007+', 'PR:020+', 'PR:023+'
    ]
    assert preferized == ['CD14-', 'CD33-', 'CD3-', 'CD16+', 'CD56+', 'CD94+']

    # TODO: Is this right?
    reported = 'Live cells/CD4 T cells/CD4+ CD45RA-/Uninfected/SSC low'
    tokenized = tokenize('Mayo', suffixsymbs, suffixsyns, reported)
    assert tokenized == [
        'Live_cells', 'CD4_T_cells', 'CD4+', 'CD45RA-', 'Uninfected', 'SSC+-'
    ]
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == [
        'PR:030', 'PR:019', 'PR:016+', 'PR:018-', 'PR:042', 'PR:040+-'
    ]
    assert preferized == [
        'Live_cells', 'CD4_T_cells', 'CD4+', 'CD45RA-', 'Uninfected', 'SSC+-'
    ]

    reported = 'B220- live,doublet excluded,CD4+ CD44highCXCR5highPD1high,ICOS+'
    tokenized = tokenize('New York Influenza', suffixsymbs, suffixsyns,
                         reported)
    assert tokenized == [
        'B220-_live', 'doublet_excluded', 'CD4+', 'CD44++', 'CXCR5++', 'PD1++',
        'ICOS+'
    ]
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == [
        'PR:004', 'PR:025', 'PR:016+', 'PR:017++', 'PR:024++', 'PR:036++',
        'PR:026+'
    ]
    assert preferized == [
        'B220-_live', 'doublet_excluded', 'CD4+', 'CD44++', 'CXCR5++', 'PD1++',
        'ICOS+'
    ]

    reported = 'lymphocytes/singlets/live/CD19-CD14-/CD3+/CD8+/CD69+IFNg+IL2+TNFa+'
    tokenized = tokenize('New York Influenza', suffixsymbs, suffixsyns,
                         reported)
    assert tokenized == [
        'lymphocytes', 'singlets', 'live', 'CD19-', 'CD14-', 'CD3+', 'CD8+',
        'CD69+', 'IFNg+', 'IL2+', 'TNFa+'
    ]
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == [
        'PR:033', 'PR:038', 'PR:029', 'PR:008-', 'PR:006-', 'PR:013+',
        'PR:022+', 'PR:021+', 'PR:027+', 'PR:028+', 'PR:041+'
    ]
    assert preferized == [
        'lymphocytes', 'singlets', 'live', 'CD19-', 'CD14-', 'CD3+', 'CD8+',
        'CD69+', 'IFNg+', 'IL2+', 'TNFa+'
    ]

    reported = 'Alexa350 (high) + Alexa750 (medium)'
    tokenized = tokenize('Modeling Viral', suffixsymbs, suffixsyns, reported)
    assert tokenized == ['Alexa350++', 'Alexa750+~']
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == ['PR:001++', 'PR:002+~']
    assert preferized == ['Axexa350++', 'Alexa750+~']

    reported = 'TNFa+IFNg-'
    tokenized = tokenize('Flow Cytometry Analysis', suffixsymbs, suffixsyns,
                         reported)
    assert tokenized == ['TNFa+', 'IFNg-']
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == ['PR:041+', 'PR:027-']
    assert preferized == ['TNFa+', 'IFNg-']

    reported = 'Mikeyhigh/RobLO/Alexa350 (high)/CD33+ý'
    tokenized = tokenize('Some Project', suffixsymbs, suffixsyns, reported)
    assert tokenized == ['Mikey++', 'Rob+-', 'Alexa350++', 'CD33+-']
    preferized, ontologized = normalize(tokenized, gate_mappings,
                                        special_gates, preferred,
                                        suffixsymbs.values())
    assert ontologized == ['PR:034++', 'PR:037+-', 'PR:001++', 'PR:014+-']
    assert preferized == ['Michael++', 'Robert+-', 'Axexa350++', 'CD33+-']
Example #9
    def train(self, corpus_filename: str):
        """Calculates emission and transition model.

        t means tag, w means word.


        Emission:   P(w[i] | t[i]) =   C(t[i], w[i]) / C(t[i])
        Transition: P(t[i] | t[i-1]) = C(t[i-1], t[i]) / C(t[i-1])

        Where C(t) counts the occurrences of t
        """

        model = {}
        pos_words = {}

        p = Path(corpus_filename)
        with open(corpus_filename) as f:
            parenstack = []
            bigrams = {}
            bigrams[self.END_STR] = {}
            word_counts = {}

            for line in f:
                tokens = common.tokenize(line)
                tokens_in_node = []
                for token in tokens:
                    if token == '(':
                        parenstack.append('(')
                        tokens_in_node = []
                    elif token == 'S':
                        prev = self.START_STR
                    elif token == ')':
                        parenstack.pop()
                        if not parenstack:
                            if not prev in bigrams[self.END_STR]:
                                bigrams[self.END_STR][prev] = 1
                            else:
                                bigrams[self.END_STR][prev] += 1

                        elif len(tokens_in_node) == 2:
                            pos, word = tokens_in_node

                            if pos != "-NONE-":
                                if not pos in bigrams:
                                    bigrams[pos] = {}

                                if not prev in bigrams[pos]:
                                    bigrams[pos][prev] = 1
                                else:
                                    bigrams[pos][prev] += 1

                                prev = pos

                                if not pos in pos_words:
                                    pos_words[pos] = {}
                                    pos_words[pos][word] = 1
                                else:
                                    if not word in pos_words[pos]:
                                        pos_words[pos][word] = 1
                                    else:
                                        pos_words[pos][word] += 1
                                # reset after every processed (pos, word) node
                                tokens_in_node = []

                                if not word in word_counts:
                                    word_counts[word] = 1
                                else:
                                    word_counts[word] += 1
                    else:
                        tokens_in_node.append(token)

            # normalize transition model
            for pos, d in bigrams.items():
                total = sum(d.values())
                for prev, count in d.items():
                    bigrams[pos][prev] = count / total

            # rare words are counted as a single word
            pos_words2 = copy.deepcopy(pos_words)

            for pos, d in pos_words.items():
                for word, count in d.items():
                    if count <= self.UNKNOWN_TRESHOLD:
                        pos_words2[pos].pop(word)
                        if not self.UNKNOWN_STR in pos_words2[pos]:
                            pos_words2[pos][self.UNKNOWN_STR] = 1
                        else:
                            pos_words2[pos][self.UNKNOWN_STR] += 1

            pos_words = pos_words2

            # normalize emission model
            for pos, d in pos_words.items():
                total = sum(d.values())
                for word, count in d.items():
                    pos_words[pos][word] = count / total

            model["bigrams"] = bigrams
            model["pos_words"] = pos_words

            p = Path(self.PICKLE_FILE)
            with p.open('wb') as output_file:
                pickle.dump(model, output_file, pickle.HIGHEST_PROTOCOL)
Example #10
    def fit(self, documents):
        documents = [tokenize(d) for d in documents]
        documents = [d[:self.max_page_size] for d in documents]
        documents = [' '.join(d) for d in documents]

        if self.encoding_type in ['tfidf', 'count', 'binary']:

            if self.encoding_type == 'tfidf':
                self.vectorizer = CountVectorizer(
                    ngram_range=(self.min_n_gram, self.max_n_gram),
                    max_features=self.max_vocab_size,
                    binary=False,
                    max_df=self.max_df,
                    analyzer=self.tokenizer_level)
                self.vectorizer.fit(documents)
            if self.encoding_type == 'count':
                self.vectorizer = CountVectorizer(
                    ngram_range=(self.min_n_gram, self.max_n_gram),
                    max_features=self.max_vocab_size,
                    binary=False,
                    max_df=self.max_df,
                    analyzer=self.tokenizer_level)
                self.vectorizer.fit(documents)
            if self.encoding_type == 'binary':
                self.vectorizer = CountVectorizer(
                    ngram_range=(self.min_n_gram, self.max_n_gram),
                    max_features=self.max_vocab_size,
                    binary=True,
                    max_df=self.max_df,
                    analyzer=self.tokenizer_level)
                self.vectorizer.fit(documents)
            with open(self.save_file_loc, 'wb') as f:
                pickle.dump(self.vectorizer, f)
        if self.encoding_type == 'lda':
            documents_tokenized = [tokenize(i) for i in documents]
            self.common_dictionary = Dictionary(documents_tokenized)
            common_corpus = [
                self.common_dictionary.doc2bow(text)
                for text in documents_tokenized
            ]
            self.vectorizer = ldamodel.LdaModel(common_corpus,
                                                id2word=self.common_dictionary,
                                                num_topics=self.num_of_topics,
                                                passes=self.vectorizer_epochs)
            self.vectorizer.save(self.save_file_loc)
        if self.encoding_type == 'doc2vec':
            tagged_documents = [
                TaggedDocument(tokenize(doc), [i])
                for i, doc in enumerate(documents)
            ]
            self.vectorizer = Doc2Vec(tagged_documents,
                                      vector_size=self.encoding_size,
                                      window=2,
                                      min_count=1,
                                      workers=4,
                                      epochs=self.vectorizer_epochs,
                                      max_vocab_size=100000)
            self.vectorizer.delete_temporary_training_data(
                keep_doctags_vectors=True, keep_inference=True)
            self.vectorizer.save(self.save_file_loc)
        if self.encoding_type == 'fasttext':
            with open(self.fasttext_training_file_location, 'w') as f:
                for i in documents:
                    f.write(clean_text(i) + '\n')
            self.vectorizer = fasttext.train_unsupervised(
                self.fasttext_training_file_location,
                model=self.fasttext_algorithm,
                dim=self.encoding_size)
            self.vectorizer.save_model(self.save_file_loc)
Example #11
parser = ArgumentParser()
parser.add_argument('-d', '--data', default='data')
args = parser.parse_args()

in_dir = join(args.data, 'raw/blogs/')
in_paths = glob(in_dir + "*xml")
out_dir = join(args.data, 'proc/blogs/')

x = []
y = []
for path in in_paths:
    if 'male' not in path:
        continue
    xml = read_xml(path)
    posts = xml.xpath(".//post/text()")
    is_female = 'female' in path
    for post in posts:
        encoding = tokenize(post)
        if encoding.shape[0] <= min_sequence_length:
            continue
        if encoding.shape[0] < sequence_length:
            encoding = pad(encoding, sequence_length)
        label = 1 if is_female else 0
        x.append(encoding[:sequence_length])
        y.append(label)

makedirs(out_dir, exist_ok=True)
np.save(join(out_dir, 'x.npy'), np.vstack(x))
np.save(join(out_dir, 'y.npy'), np.array(y))
Example #12
from gensim import corpora, models, matutils
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.externals import joblib  # deprecated path; recent scikit-learn versions need "import joblib" directly
from sklearn.svm import SVC
from common import CATEGORIES, DICT_PATH, TFIDF_MODEL_PATH, SVC_MODEL_PATH, tokenize, load_documents

titles, labels = load_documents()
print('教師データを読み込みました: titles=%s labels=%s' % (len(titles), len(labels)))  # "Loaded the training data"
print('')

documents = [tokenize(title) for title in titles]
print('形態素解析によりトークン化しました: ドキュメント数=%s' % (len(documents)))  # "Tokenized with morphological analysis: document count"
print('')

dictionary = corpora.Dictionary(documents)
dictionary.filter_extremes(no_above=0.7)
print('辞書を作成しました: ユニークトークン数=%s' % (len(dictionary)))  # "Built the dictionary: unique token count"
print('')

dictionary.save(DICT_PATH)
print('辞書を保存しました:', DICT_PATH)  # "Saved the dictionary:"
print('')

bow_corpus = [dictionary.doc2bow(doc) for doc in documents]
print('BOWコーパスを作成しました:', bow_corpus[0])  # "Built the bag-of-words corpus:"
print('')

tfidf = models.TfidfModel(bow_corpus)
print('TF-IDFモデルを作成しました')  # "Built the TF-IDF model"
print('')
Example #13
def tokenizer(text):
    tokens = tokenize(text)
    tokens = stemwords(tokens)
    return tokens
Example #14
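# (this excerpt begins mid-way through a per-message loop; the marker list below
#  was used to locate reply/signature boundaries whose positions end up in
#  reply_indices)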
        ['\nFrom:', '\nTo:', '- Original Message -', '-----------']
    ]
    reply_indices = [index for index in reply_indices if index > -1]
    if len(reply_indices) > 0:  # remove older messages, signatures, etc.
        text = text[:min(reply_indices) - 5]
    lines = text.split('\n')
    non_quote_lines = [line for line in lines
                       if not line.startswith('>')]  # remove quotes
    text = '\n'.join(non_quote_lines)
    for name in names:  # remove own name as indicator for gender (e.g. in signature)
        if len(name) > 2:
            text = sub(name, '', text, flags=IGNORECASE)
    text = text.replace(
        '?', ''
    )  # replace question mark, as it is also used for unknown characters and redacted text
    encoding = tokenize(text)
    if encoding.shape[0] <= min_sequence_length:
        continue
    if encoding.shape[0] < sequence_length:
        encoding = pad(encoding, sequence_length)
    x.append(encoding[:sequence_length])
    label = 1 if 'female' in gender else 0
    y.append(label)
    checksums.append(checksum)

x = np.vstack(x)
y = np.array(y)

makedirs(out_dir, exist_ok=True)
np.save(join(out_dir, 'x'), x)
np.save(join(out_dir, 'y'), y)
Example #15
def predict(text):
    x = tokenize(text)
    with graph.as_default():
        probability = model.predict(np.array([x], ))[0][0]
    return probability
Example #16
# a dictionary of the form { term: { docid: freq } }
term_dictionary = defaultdict(Counter)
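# e.g. term_dictionary['cat'] == Counter({3: 2, 7: 1}) would mean that the term
# 'cat' occurs twice in doc 3 and once in doc 7 (illustrative values only)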

all_doc_ids = sorted(map(int, os.listdir(input_directory)))
doc_size = dict()

for doc_id in all_doc_ids:
    print "Trying to index doc %s..." % doc_id
    filepath = os.path.join(input_directory, str(doc_id))

    with open(filepath) as input_file:
        document_content = input_file.read()
        unique_terms = 0

        for term in tokenize(document_content):
            if doc_id not in term_dictionary[term]:
                unique_terms += 1

            term_dictionary[term][doc_id] += 1

        doc_size[doc_id] = unique_terms


# Formats the posting list for a specific term
# - input: a posting of the form { doc_id: freq, doc_id: freq }
# - return: a formatted posting string with "doc_id:freq doc_id:freq"
def format_posting_list(posting):
    sorted_doc_ids = sorted(posting)

    posting_strings = []
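    # The excerpt ends here; a minimal completion producing the documented
    # "doc_id:freq doc_id:freq" format might look like:
    for doc_id in sorted_doc_ids:
        posting_strings.append('%s:%s' % (doc_id, posting[doc_id]))
    return ' '.join(posting_strings)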
Example #17
def extract_grammar(filename: str, args):
    with open(filename, "r") as corpus_file:
        pos_stack = []
        paren_stack = []
        words_pos = defaultdict(dd)
        prev = None
        root = None
        rules = []
        terminals = set()

        for line in corpus_file:
            tokens = common.tokenize(line)

            for token in tokens:
                if token == "(":
                    paren_stack.append("(")
                elif token == ")":
                    while True:
                        if not paren_stack:
                            return "bad"
                        else:
                            el = paren_stack.pop()
                            if pos_stack:
                                pos_stack.pop()
                            if el == "(":
                                break
                    if not paren_stack:
                        save_rule(root, rules)
                        pos_stack = []
                        paren_stack = []
                        prev = None
                        root = None
                else:
                    if prev == "(":
                        node = Node(token)
                        if pos_stack:
                            pos_stack[-1].children.append(node)
                        else:
                            root = node
                        pos_stack.append(node)
                    elif prev != "-NONE-":
                        # previous token was a POS, token can only be a terminal
                        terminals.add(prev)
                        words_pos[token][prev] += 1

                prev = token

        # probabilities from frequencies
        for word, d in words_pos.items():
            total = sum(d.values())
            for pos, count in d.items():
                words_pos[word][pos] = count / total

        MOST_COMMON_COUNT = 1000
        sorted_rules = Counter(rules).most_common()

        for rule in sorted_rules[:MOST_COMMON_COUNT]:
            same_lhs = filter(lambda r: r[0].lhs == rule[0].lhs, sorted_rules)
            total = sum([r[1] for r in same_lhs])
            rule[0].prob = rule[1] / total

        grammar_rules = [rule[0] for rule in sorted_rules[:MOST_COMMON_COUNT]]
        grammar = Grammar(grammar_rules, terminals, words_pos)

        if args.mode == "pre":
            print("{:>5} | {}".format("count", "rule"))
            for rule in sorted_rules[:MOST_COMMON_COUNT]:
                print("{:>5} | {}".format(rule[1], rule[0]))

            sum_first = sum(
                [rule[1] for rule in sorted_rules[:MOST_COMMON_COUNT]])
            sum_rest = sum(
                [rule[1] for rule in sorted_rules[MOST_COMMON_COUNT:]])

            print("{} rules total:".format(len(rules)))
            print("{:>8} (first {})".format(sum_first, MOST_COMMON_COUNT))
            print("{:>8} (rest)".format(sum_rest))
            print("{:.3f}% coverage".format(100.0 * sum_first / len(rules)))
            assert len(rules) == sum_first + sum_rest
        else:
            for rule in grammar.rules:
                print(rule)

            p = Path(PICKLE_FILE)
            with p.open('wb') as output_file:
                pickle.dump(grammar, output_file, pickle.HIGHEST_PROTOCOL)

        return grammar
Example #18
def main():
    # Define command-line parameters
    parser = argparse.ArgumentParser(
        description='Normalize cell population descriptions')
    parser.add_argument(
        'excluded',
        type=argparse.FileType('r'),
        help='a TSV file with experiment accessions to be ignored')
    parser.add_argument(
        'scale',
        type=argparse.FileType('r'),
        help='a TSV file with the value scale (e.g. high, low, negative)')
    parser.add_argument(
        'mappings',
        type=argparse.FileType('r'),
        help='a TSV file which maps gate labels to ontology ids/keywords')
    parser.add_argument(
        'special',
        type=argparse.FileType('r'),
        help='a TSV file containing extra information about a subset of gates')
    parser.add_argument(
        'preferred',
        type=argparse.FileType('r'),
        help='a TSV file which maps ontology ids to preferred labels')
    parser.add_argument('cells',
                        type=argparse.FileType('r'),
                        help='an OWL file for the Cell Ontology')
    parser.add_argument('source',
                        type=argparse.FileType('r'),
                        help='the source data TSV file')
    parser.add_argument('output', type=str, help='the output TSV file')

    # Parse command-line parameters
    args = parser.parse_args()

    # Load the contents of the file given by the command-line parameter args.excluded
    # These are the experiments we should ignore when reading from the source file
    excluded_experiments = set()
    rows = csv.DictReader(args.excluded, delimiter='\t')
    for row in rows:
        excluded_experiments.add(row['Experiment Accession'])

    # Load the contents of the file given by the command-line parameter args.scale.
    # This defines the suffix synonyms and symbols for various scaling indicators,
    # which must be noted during parsing
    rows = csv.DictReader(args.scale, delimiter='\t')
    suffixsymbs, suffixsyns = extract_suffix_syns_symbs_maps(rows)

    # Load the contents of the file given by the command-line parameter args.mappings.
    # This file associates gate labels with the ontology ids / keywords with which we populate the
    # 'Gating mapped to ontologies' column of the output file.
    rows = csv.DictReader(args.mappings, delimiter='\t')
    gate_mappings = {}
    for row in rows:
        gate_mappings[row['Label']] = row['Ontology ID']

    # Load the contents of the file given by the command-line parameter args.special.
    # This file (similarly to the args.mappings file) associates certain gate labels with ontology ids
    # but also contains additional information regarding these gates.
    rows = csv.DictReader(args.special, delimiter='\t')
    special_gates = {}
    for row in rows:
        special_gates[row['Label']] = {
            'Ontology ID': row['Ontology ID'],
            'Synonyms': row['Synonyms'],
            'Toxic Synonym': row['toxic synonym']
        }

    # Load the contents of the file given by the command-line parameter args.preferred.
    # This file associates ontology ids with preferred gate labels (i.e. pr#PRO-short-label).
    rows = csv.DictReader(args.preferred, delimiter='\t')
    preferred = {}
    for row in rows:
        preferred[row['Ontology ID']] = row['Preferred Label']

    # Load the contents of the file given by args.cells. This is an OWL file in XML format. We first
    # parse it using python's xml library, and then call update_iri_maps_from_owl
    # to retrieve the maps: synonym_iris, iri_labels, iri_gates, and iri_parents
    tree = ET.parse(args.cells)
    iri_gates, iri_parents, iri_labels, synonym_iris = update_iri_maps_from_owl(
        tree)

    # Finally, load the contents of the source file, process each row and write the processed row
    # to a new file.
    rows = csv.DictReader(args.source, delimiter='\t')
    with open(args.output, 'w') as output:
        w = csv.writer(output, delimiter='\t', lineterminator='\n')
        # Write the header row:
        output_fieldnames = [
            'NAME', 'STUDY_ACCESSION', 'EXPERIMENT_ACCESSION',
            'POPULATION_NAME_REPORTED', 'CL term', 'CL ID', 'CL definition',
            'extra', 'POPULATION_DEFNITION_REPORTED',
            'Population preferred name', 'Gating tokenized',
            'Gating mapped to ontologies', 'Gating preferred definition',
            'Conflicts', 'Conflict type'
        ]
        w.writerow(output_fieldnames)

        conflict_count = 0
        symbols = suffixsymbs.values()
        for row in rows:
            # Ignore any rows describing excluded experiments.
            if row['EXPERIMENT_ACCESSION'] in excluded_experiments:
                continue

            # Tokenize and normalize the population name:
            extra = row['extra'].strip()
            tokenized_gates = tokenize('Standard', suffixsymbs, suffixsyns,
                                       extra)
            preferized_gates, ontologized_gates = normalize(
                tokenized_gates, gate_mappings, special_gates, preferred,
                symbols)

            # Determine the population preferred name:
            preferred_name = row['CL term'] or ''
            if preferred_name and preferized_gates:
                preferred_name += ' & ' + ', '.join(preferized_gates)
            row['Population preferred name'] = preferred_name

            # Determine the CL definition:
            population_gates = []
            cell_type = re.sub('^CL:', 'http://purl.obolibrary.org/obo/CL_',
                               row['CL ID'])
            if cell_type and cell_type in iri_gates:
                for gate in iri_gates[cell_type]:
                    preferred_label = preferred.get(gate['kind'])
                    if preferred_label:
                        population_gates.append(
                            preferred_label + get_iri_levels()[gate['level']])
            row['CL definition'] = ', '.join(population_gates)

            # These will be needed later for determining conflicts:
            extra_gates = preferized_gates.copy()
            cell_gates = population_gates + preferized_gates

            # Tokenize and normalize the reported population definition, first removing any surrounding
            # quotation marks:
            reported = row['POPULATION_DEFNITION_REPORTED'].strip('"').strip(
                "'")
            tokenized_gates = tokenize(row['NAME'], suffixsymbs, suffixsyns,
                                       reported)
            row['Gating tokenized'] = ', '.join(tokenized_gates)
            preferized_gates, ontologized_gates = normalize(
                tokenized_gates, gate_mappings, special_gates, preferred,
                symbols)
            row['Gating mapped to ontologies'] = ', '.join(ontologized_gates)
            row['Gating preferred definition'] = ', '.join(preferized_gates)

            # Determine the conflicts:
            conflict_type = ''
            conflicts = []
            for population_gate in cell_gates:
                for definition_gate in preferized_gates:
                    pgate, plevel = split_gate(population_gate, symbols)
                    dgate, dlevel = split_gate(definition_gate, symbols)
                    ppos = plevel != '-'
                    dpos = dlevel != '-'
                    if pgate == dgate and ppos != dpos:
                        conflicts.append(population_gate + '/' + dlevel)
                        if population_gate in extra_gates:
                            conflict_type = 'conflict with extra'
                        else:
                            conflict_type = 'conflict with CL definition'
            if len(conflicts) > 0:
                print(conflicts)
                conflict_count += 1
            row['Conflicts'] = ', '.join(conflicts)
            row['Conflict type'] = conflict_type

            # Explicitly reference output_fieldnames here to make sure that the order in which the data
            # is written to the file matches the header order.
            w.writerow([row[fn] for fn in output_fieldnames])

        print('Conflicts:', conflict_count)
Example #19
def parse_query(query):
    """Parses a query into { term: freq } mapping"""
    return Counter(tokenize(query))
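
# A hedged usage sketch (assuming tokenize() simply lowercases and splits the
# query on whitespace, which this excerpt does not show):
#   parse_query('cat dog cat')  ->  Counter({'cat': 2, 'dog': 1})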