def _convert_item(self, item):
    """Convert a sentence to a list of tokens."""
    if self.pre_splitted:
        return item
    elif self.split:
        return any2unicode(item).split()
    else:
        return self.split_func(any2unicode(item))
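The helper above delegates all input coercion to `any2unicode` (aliased in gensim as `utils.to_unicode`). A minimal sketch of what that coercion does, assuming only a stock gensim install:

from gensim import utils

# bytes are decoded (utf8 by default); unicode str passes through unchanged
assert utils.any2unicode(b'caf\xc3\xa9') == 'café'
assert utils.any2unicode('café') == 'café'

# errors='ignore' silently drops undecodable bytes, handy for dirty input
print(utils.any2unicode(b'caf\xff', errors='ignore'))  # -> 'caf'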
Example #2
def bow_mail_body(txt, nlp):
    """
    args:
        - txt: raw text
        - nlp: a spacy engine
    """
    # to unicode & get rid of accent
    txt = deaccent(any2unicode(txt))
    # split on reply/forward markers (drop the quoted-mail header)
    txt = "\n".join(re_fw_regex.split(txt))
    txt = txt.replace(">", " ")
    # split sentences
    sentences = sent_tokenize(txt)
    # tokenize + lemmatize + filter ?
    bow = []
    for sent in sentences:
        if REGEX:
            sent = " ".join(lower_upper_pat.split(sent))
            sent = " ".join(number_letter_pat.split(sent))
        doc = nlp(sent, parse=False, entity=False)
        for tok in doc:
            if (tok.lemma_ and not tok.is_punct and not tok.is_stop
                    and not tok.like_num and not tok.is_space
                    and not tok.like_url and len(tok) > 1 and not any(
                        (x in tok.orth_ for x in not_in_list))):
                if tok.orth_.startswith("-") or tok.orth_.endswith("-"):
                    bow.append(tok.lemma_.replace("-", ""))
                else:
                    bow.append(tok.lemma_)
    return bow
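`bow_mail_body` (and `extract_names` below) lean on several module-level names the snippet does not show: `re_fw_regex`, `lower_upper_pat`, `number_letter_pat`, `REGEX`, and `not_in_list`. A plausible reconstruction, under the assumption that each pattern does what the comments describe; the originals may differ:

import re

REGEX = True  # toggle for the extra camelCase / letter-number splitting passes

# reply/forward markers that open a quoted block in an email body (assumed)
re_fw_regex = re.compile(r'-+ ?Original Message ?-+|^From:|^Forwarded by', re.M)

# zero-width splits: 'camelCase' -> 'camel Case', 'abc123' -> 'abc 123'
# (re.split on zero-width matches requires Python 3.7+)
lower_upper_pat = re.compile(r'(?<=[a-z])(?=[A-Z])')
number_letter_pat = re.compile(r'(?<=\d)(?=[A-Za-z])|(?<=[A-Za-z])(?=\d)')

# substrings that disqualify a token outright (assumed)
not_in_list = ('@', 'http', 'www.')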
Example #3
def extract_names(txt, nlp, n_sentences=2):
    """
    Use the spacy entity engine to extract person names from a text
    args:
        - txt: raw text
        - nlp: a spacy engine
    return:
        - list of names as strings
    """
    # to unicode & get rid of accent
    txt = deaccent(any2unicode(txt))
    # split on reply/forward markers (drop the quoted-mail header)
    txt = "\n".join(re_fw_regex.split(txt))
    txt = txt.replace(">", " ")
    # split sentences
    sentences = sent_tokenize(txt)
    # tokenize + lemmatize + filter ?
    bow = []
    for sent in sentences[:n_sentences]:
        if REGEX:
            sent = " ".join(lower_upper_pat.split(sent))
            sent = " ".join(number_letter_pat.split(sent))
        doc = nlp(sent, parse=False)
        for tok in doc:
            lemma = drop_digits(replace_punct(tok.lemma_))
            if (lemma and (tok.ent_type_ != 'PERSON') and not tok.is_punct
                    and not tok.is_stop and lemma not in extendedstopwords
                    and not tok.like_num and not tok.is_space
                    and not tok.like_url and len(lemma) > 1 and not any(
                        (x in tok.orth_ for x in not_in_list))):
                bow.append(lemma)
    return bow
Example #4
    def test_get_offsets_and_start_doctags_win(self):
        # Each line takes 7 bytes (including '\n' character which is actually '\r\n' on Windows)
        lines = ['line1\n', 'line2\n', 'line3\n', 'line4\n', 'line5\n']
        tmpf = get_tmpfile('gensim_doc2vec.tst')

        with utils.smart_open(tmpf, 'wb', encoding='utf8') as fout:
            for line in lines:
                fout.write(utils.any2unicode(line))

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 1)
        self.assertEqual(offsets, [0])
        self.assertEqual(start_doctags, [0])

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 2)
        self.assertEqual(offsets, [0, 14])
        self.assertEqual(start_doctags, [0, 2])

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 3)
        self.assertEqual(offsets, [0, 7, 21])
        self.assertEqual(start_doctags, [0, 1, 3])

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 4)
        self.assertEqual(offsets, [0, 7, 14, 21])
        self.assertEqual(start_doctags, [0, 1, 2, 3])

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 5)
        self.assertEqual(offsets, [0, 7, 14, 21, 28])
        self.assertEqual(start_doctags, [0, 1, 2, 3, 4])

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 6)
        self.assertEqual(offsets, [0, 0, 7, 14, 14, 21])
        self.assertEqual(start_doctags, [0, 0, 1, 2, 2, 3])
Example #5
    def test_get_offsets_and_start_doctags(self):
        # Each line takes 6 bytes (including '\n' character)
        lines = ['line1\n', 'line2\n', 'line3\n', 'line4\n', 'line5\n']
        tmpf = get_tmpfile('gensim_doc2vec.tst')

        with utils.open(tmpf, 'wb', encoding='utf8') as fout:
            for line in lines:
                fout.write(utils.any2unicode(line))

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 1)
        self.assertEqual(offsets, [0])
        self.assertEqual(start_doctags, [0])

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 2)
        self.assertEqual(offsets, [0, 12])
        self.assertEqual(start_doctags, [0, 2])

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 3)
        self.assertEqual(offsets, [0, 6, 18])
        self.assertEqual(start_doctags, [0, 1, 3])

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 4)
        self.assertEqual(offsets, [0, 6, 12, 18])
        self.assertEqual(start_doctags, [0, 1, 2, 3])

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 5)
        self.assertEqual(offsets, [0, 6, 12, 18, 24])
        self.assertEqual(start_doctags, [0, 1, 2, 3, 4])

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 6)
        self.assertEqual(offsets, [0, 0, 6, 12, 18, 24])
        self.assertEqual(start_doctags, [0, 0, 1, 2, 3, 4])
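The expected offsets in this test and the Windows variant above are plain byte arithmetic: every line is 'lineN' plus its newline, so line starts fall on multiples of 6 (or 7 with '\r\n'). A quick sanity check:

# per-line byte length drives the expected offsets
assert len('line1\n'.encode('utf8')) == 6    # unix line ending
assert len('line1\r\n'.encode('utf8')) == 7  # windows line ending

print([i * 6 for i in range(5)])  # -> [0, 6, 12, 18, 24]
print([i * 7 for i in range(5)])  # -> [0, 7, 14, 21, 28]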
Example #6
    def test_save_as_line_sentence_ru(self):
        corpus_file = get_tmpfile('gensim_utils.tst')
        ref_sentences = [l.split() for l in utils.any2unicode('привет мир\nкак ты поживаешь').split('\n')]
        utils.save_as_line_sentence(ref_sentences, corpus_file)

        with utils.open(corpus_file, 'rb', encoding='utf8') as fin:
            sentences = [line.strip().split() for line in fin.read().strip().split('\n')]
            self.assertEqual(sentences, ref_sentences)
Example #7
    def test_save_as_line_sentence_ru(self):
        corpus_file = get_tmpfile('gensim_utils.tst')
        ref_sentences = [l.split() for l in utils.any2unicode('привет мир\nкак ты поживаешь').split('\n')]
        utils.save_as_line_sentence(ref_sentences, corpus_file)

        with utils.smart_open(corpus_file, encoding='utf8') as fin:
            sentences = [line.strip().split() for line in fin.read().strip().split('\n')]
            self.assertEqual(sentences, ref_sentences)
Example #8
    def test_save_as_line_sentence_en(self):
        corpus_file = get_tmpfile('gensim_utils.tst')
        ref_sentences = [l.split() for l in utils.any2unicode('hello world\nhow are you').split('\n')]

        utils.save_as_line_sentence(ref_sentences, corpus_file)

        with utils.open(corpus_file, 'rb', encoding='utf8') as fin:
            sentences = [line.strip().split() for line in fin.read().strip().split('\n')]
            self.assertEqual(sentences, ref_sentences)
Example #9
    def test_save_as_line_sentence_en(self):
        corpus_file = get_tmpfile('gensim_utils.tst')
        ref_sentences = [l.split() for l in utils.any2unicode('hello world\nhow are you').split('\n')]

        utils.save_as_line_sentence(ref_sentences, corpus_file)

        with utils.smart_open(corpus_file, encoding='utf8') as fin:
            sentences = [line.strip().split() for line in fin.read().strip().split('\n')]
            self.assertEqual(sentences, ref_sentences)
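All four variants above verify the same round trip: `save_as_line_sentence` writes one sentence per line with tokens joined by single spaces. A compact usage sketch:

from gensim import utils

sentences = [['hello', 'world'], ['how', 'are', 'you']]
utils.save_as_line_sentence(sentences, 'corpus.txt')
# corpus.txt now contains:
#   hello world
#   how are you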
Example #10
def get_block_of_tweets(filepath):
    block_of_tweets = ''
    with open(filepath) as tweetsfile:
        tweetreader = csv.reader(tweetsfile)
        metadata = next(tweetreader)  # skip the CSV header row
        for tweetrow in tweetreader:
            tweetstring = tweetrow[0]
            block_of_tweets += tweetstring
    clean_block_of_tweets = utils.any2unicode(block_of_tweets.replace('\n', ' ').replace('\t', ' '), errors='ignore')
    text = [word for word in clean_block_of_tweets.lower().split() if word not in stoplist]
    return text
Example #11
    def __iter__(self):
        """Iterate through the lines in the source."""
        try:
            # Assume it is a file-like object and try treating it as such.
            # Things that don't have seek will trigger an exception.
            self.source.seek(0)
            for line in itertools.islice(self.source, self.limit):
                line = utils.any2unicode(line, errors='replace').split()
                i = 0
                while i < len(line):
                    yield line[i: i + self.max_sentence_length]
                    i += self.max_sentence_length
        except AttributeError:
            # If it didn't work like a file, use it as a string filename
            with utils.smart_open(self.source) as fin:
                for line in itertools.islice(fin, self.limit):
                    line = utils.any2unicode(line, errors='replace').split()
                    i = 0
                    while i < len(line):
                        yield line[i: i + self.max_sentence_length]
                        i += self.max_sentence_length
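The `max_sentence_length` slicing above just chops an over-long line into fixed-size token windows. The same logic in isolation, with a hypothetical window of 4 tokens (gensim's LineSentence default is 10000):

tokens = 'a b c d e f g h i j'.split()
max_sentence_length = 4

chunks = [tokens[i: i + max_sentence_length]
          for i in range(0, len(tokens), max_sentence_length)]
print(chunks)  # -> [['a', 'b', 'c', 'd'], ['e', 'f', 'g', 'h'], ['i', 'j']]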
Example #12
    def __iter__(self):
        """Iterate through the lines in the source.

        Yields
        ------
        tuple : (list[str], int)
            Tuple of the line's tokens and its line index

        """
        with open(self.path, "rb") as f:
            for i, line in enumerate(f):
                yield (any2unicode(line).split(), i)
Example #13
    def __iter__(self):
        """Iterate through the lines in the source.

        Yields
        ------
        :class:`~fse.inputs.IndexedSentence`
            IndexedSentence from `path` specified in the constructor.

        """
        with s_open(self.path, "rb") as f:
            for i, line in enumerate(f):
                yield IndexedSentence(any2unicode(line).split(), i)
Example #14
    def getSimilarityMatrix(self, sf, recomUrls, userUrls):
        uu = utils()
        # create the corpus
        corpus = []
        for iindex, rowi in recomUrls.iterrows():
            ur1 = rowi['url']
            sf, sm1 = self.querySummary(sf, ur1)
            sm1 = genu.any2unicode(sm1)
            sm1 = uu.createTaggedDataForSummary(sm1)
            corpus.append(sm1)

        dictionary = corpora.Dictionary(corpus)
        corpusBow = [dictionary.doc2bow(text) for text in corpus]
        tfidf = models.TfidfModel(corpusBow)
        corpus_tfidf = tfidf[corpusBow]

        #create lsi model
        lsi = models.LsiModel(corpus_tfidf, id2word=dictionary)
        # corpus_lsi = lsi[corpus_tfidf]
        index = similarities.MatrixSimilarity(lsi[corpusBow])

        rCorpus = []
        for iindex, rowi in userUrls.iterrows():
            ur2 = rowi['url']
            sf, sm2 = self.querySummary(sf, ur2)
            sm2 = genu.any2unicode(sm2)
            sm2 = uu.createTaggedDataForSummary(sm2)
            rCorpus.append(sm2)

        # generate results
        vec_bow = [dictionary.doc2bow(text) for text in rCorpus]
        vec_lsi = lsi[vec_bow]
        sims = index[vec_lsi]

        """
        rows are user urls and columns are recommended urls
        """
        return  sims
Example #15
def preprocess_txt(raw_txt):
    """
    Preprocess raw text before parsing with spaCy:
    - convert to unicode and remove accents
    - split on reply/forward markers
    - replace the > of email replies
    - split lowerUpper boundaries
    - split letterNumber boundaries
    """
    txt = deaccent(any2unicode(raw_txt))
    txt = "\n".join(re_fw_regex.split(txt))
    txt = txt.replace(">", " ")
    txt = " ".join(lower_upper_pat.split(txt))
    txt = " ".join(number_letter_pat.split(txt))
    return txt
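Assuming the pattern definitions sketched after Example #2, the pipeline behaves roughly like this (illustrative output, not from the original repo):

raw = b'Caf\xc3\xa9Menu: 3drinks > see below'
print(preprocess_txt(raw))
# roughly: 'Cafe Menu: 3 drinks   see below'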
Example #16
    def test_cython_linesentence_readline_after_getting_offsets(self):
        lines = ['line1\n', 'line2\n', 'line3\n', 'line4\n', 'line5\n']
        tmpf = get_tmpfile('gensim_doc2vec.tst')

        with utils.smart_open(tmpf, 'wb', encoding='utf8') as fout:
            for line in lines:
                fout.write(utils.any2unicode(line))

        from gensim.models.word2vec_corpusfile import CythonLineSentence

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 5)
        for offset, line in zip(offsets, lines):
            ls = CythonLineSentence(tmpf, offset)
            sentence = ls.read_sentence()
            self.assertEqual(len(sentence), 1)
            self.assertEqual(sentence[0], utils.any2utf8(line.strip()))
Example #17
def predict_candidate(blob_of_tweets, k_neighbors, k_threshold):
    training_set = []
    for candidate_handle in candidate_handles:
        candidate_folder = candidate_supporter_tweets_folders[candidate_handle]
        dirlist = os.listdir(candidate_folder)
        dirlist = [file for file in dirlist if file.endswith('.csv')]
        shuffle(dirlist)
        for i in range(len(dirlist)):
            filepath = os.path.join(candidate_folder, dirlist[i])
            training_set.append(filepath)

    tfidf, index, dictionary, id_to_path_dict, corpus = create_tfidf_from_file()

    clean_block_of_tweets = utils.any2unicode(blob_of_tweets.replace('\n', ' ').replace('\t', ' '), errors='ignore')
    text = [word for word in clean_block_of_tweets.lower().split() if word not in stoplist]

    return classify_tfidf_knn(tfidf, dictionary, index, text, k_neighbors, id_to_path_dict, k_threshold)
Example #18
def process_page(page):
    """
    Preprocess a single periodical page, returning the result as
    a unicode string.

    Removes all non-alpha characters from the text.

    Args:
        page (str): raw content of the page

    Returns:
        str: Content of the file, but without punctuation and non-alpha characters.
    """
    content = utils.any2unicode(page, 'utf8').strip()
    content = re.sub(r"[^a-zA-Z]", " ", content)

    return content
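A quick usage sketch with hypothetical input; every non-alphabetic character collapses to a space:

page = b'Page 42: Hello, world!'
print(process_page(page))
# roughly: 'Page    Hello  world '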
Example #19
def add_to_gensim_dictionary_and_corpus(dictionary, corpus, id_to_path_dict, path_to_tweets_csv):
    # corpus should be a list (of lists), and dictionary must be a gensim dictionary
    block_of_tweets = ''
    with open(path_to_tweets_csv) as tweetsfile:
        tweetreader = csv.reader(tweetsfile)
        try:
            metadata = next(tweetreader)
        except StopIteration:
            # the CSV may be empty for some reason
            return dictionary, corpus, id_to_path_dict
        for tweetrow in tweetreader:
            tweetstring = tweetrow[0]
            block_of_tweets += tweetstring
    clean_block_of_tweets = utils.any2unicode(block_of_tweets.replace('\n', ' ').replace('\t', ' '), errors='ignore')
    text = [word for word in clean_block_of_tweets.lower().split() if word not in stoplist]
    id_to_path_dict[len(corpus)] = path_to_tweets_csv
    dictionary.add_documents([text])
    corpus.append(dictionary.doc2bow(text))
    return dictionary, corpus, id_to_path_dict
Example #20
    def analyize(self, text):
        try:
            unitext = any2unicode(text, encoding='utf8', errors='strict')
        except UnicodeDecodeError:
            print("Not utf-8")
            return []

        #convert to lower
        lowerText = unitext.lower()

        # WhitespaceTokenizer splits on whitespace only, e.g. 'qwe (x)' stays as 'qwe', '(x)';
        # punctuation and digits survive, so only stemming and the length filter below prune tokens
        tokenizer = WhitespaceTokenizer()
        regexTokens = tokenizer.tokenize(lowerText)
        p_stemmer = PorterStemmer()
        stemmedTokens = [p_stemmer.stem(i) for i in regexTokens]

        stemmedRemSingleLetterTokens = [w for w in stemmedTokens if len(w)>1]
        return stemmedRemSingleLetterTokens
Example #21
    def _process(self, s):
        """Takes in a string and returns either a string (no tokenization) or
        a list of strings (tokenized).
        """
        # TODO: Clarify contract return value for filtered strings
        # TODO: Add docs for: if a filter evaluates as False the string is discarded

        # Skip empty strings:
        if len(s) == 0:
            # Keep return type consistent
            if self.tokenizer:
                return []
            else:
                return ''

        # Normalize the encoding before anything else
        if self.normalize_encoding:
            s = any2unicode(s)

        # Apply sub, filter, sub
        for sub in self.pre_substitutions:
            s = sub(s)

        # If any of the filters return True, filter the string
        if any(f(s) for f in self.filters):
            # Keep return type consistent
            if self.tokenizer:
                return []
            else:
                return ''

        for sub in self.post_substitutions:
            s = sub(s)

        # Tokenize last.  If you want to process the tokens use another
        # Transformer
        if self.tokenizer and s:
            # keep a concrete list (not a lazy filter object) for a consistent return type
            s = [token for token in self.tokenizer(s) if token]

        return s
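Since the class around `_process` is not shown, here is a hedged usage sketch that fakes the host object with a `SimpleNamespace`; all attribute values are assumptions chosen to exercise each stage:

from types import SimpleNamespace

host = SimpleNamespace(
    normalize_encoding=False,               # skip any2unicode for this plain-str demo
    pre_substitutions=[str.strip],
    filters=[lambda s: s.startswith('#')],  # discard comment-like strings
    post_substitutions=[str.lower],
    tokenizer=str.split,
)
print(_process(host, '  Hello WORLD  '))  # -> ['hello', 'world']
print(_process(host, '# filtered out'))   # -> []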
Example #22
    def __getitem__(self, i):
        """ Returns the line indexed by i. Primarily used for 
        :meth:`~fse.models.sentencevectors.SentenceVectors.most_similar`
        
        Parameters
        ----------
        i : int
            The line index used to index the file

        Returns
        -------
        str
            line at the current index

        """
        if not self.get_able:
            raise RuntimeError("To index the lines you must construct with get_able=True")

        with open(self.path, "rb") as f:
            f.seek(self.line_offset[i])
            output = f.readline()
            f.seek(0)
            return any2unicode(output).rstrip()
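`line_offset` is assumed to be a precomputed index of byte offsets, one per line; the constructor is not shown. A minimal sketch of how such an index could be built:

def build_line_offsets(path):
    """Return the byte offset of each line start in `path`."""
    offsets = []
    with open(path, "rb") as f:
        offset = 0
        for line in f:
            offsets.append(offset)
            offset += len(line)
    return offsets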
Example #23
    print "begin"


    i2e={}
    index_fixe=[0]
    pretrainedFile="/media/data/datasets/models/new_arame/"+lang+".vec" if remote else "data/pretrained.txt"
        
    if remote:
        sys.path.insert(0, '/home/arame/hakken-api/models/')
        import model
        import utils
        max_voc=2500 if extract else 'inf'
        pretrained=model.model(pretrainedFile,max_voc=max_voc,decale=1)
        for word in words:
            if any2unicode(word) in pretrained.vocab:
                i=w2i[word]
                if False and len(word)>1:# and word[0:2]=="##":
                    index_fixe.append(i)
                i2e[i]=pretrained.getVector(word)
    else:
        import utils
        wordsModel,floatsModel=utils.loadModel(pretrainedFile)
        for word,oldi in wordsModel.items(): 
            if word in words:
                i=w2i[word]
                if len(word)>1:# and word[0:2]=="#":
                    index_fixe.append(i)
                i2e[i]=floatsModel[oldi]
            
    print "finish load pretrain"
Example #24
#               debug=True)

stop_words = get_custom_stop_words()

pruned_words, counters, total_words = Phrases.learn_vocab(
    sentences=LineSentence(unigram_sentences_path),
    max_vocab_size=800000000,
    common_terms=stop_words,
    progress_per=100)

counters = sorted(counters.items(),
                  key=lambda key_value: key_value[1],
                  reverse=True)

count = 0
for key, value in counters:
    count += 1
    print(any2unicode(key), value)
print(count)

bigram_model = Phrases(LineSentence(unigram_sentences_path),
                       max_vocab_size=800000000,
                       progress_per=100,
                       threshold=0.5,
                       min_count=100,
                       common_terms=stop_words,
                       scoring='npmi')
for sentence in LineSentence(unigram_sentences_path):
    bigram_sentence = u' '.join(bigram_model[sentence])
    print(bigram_sentence + '\n')
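With scoring='npmi', Phrases scores fall in [-1, 1], which is what makes the 0.5 threshold above meaningful (the default scorer's threshold of 10.0 lives on a different scale). A sketch of the normalized-PMI formula under simple corpus counts:

import math

def npmi(count_a, count_b, count_ab, corpus_size):
    """Normalized PMI, bounded in [-1, 1]; 1 means the pair always co-occurs."""
    pa, pb, pab = count_a / corpus_size, count_b / corpus_size, count_ab / corpus_size
    return math.log(pab / (pa * pb)) / -math.log(pab)

print(npmi(100, 100, 95, 10000))  # -> ~0.98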
Example #25
def main():

    #NOTE I used tr -d to remove ' from the file
    #contents = TaggedPubMed('big_home_test.txt')
    contents = TaggedPubMed("age_fix.txt")

    #vocab_list = ['rs10795668', 'mir-135a', 'lynch_syndrom_i', 'c18.8', 'folfiri-cetuximab', 'rs4939827', 'colon_carcinoma', 'transvers_colon_cancer', 'ctnnb1', 'rs1035209', 'p14', 'anastomosi', 'cowden_syndrom', 'oxaliplatin', 'msi-h', 'dna_imag_cytometri', 'capox', 'endorect_mri', 'aflibercept', 'argon', 'egf', 'rs4925386', 'c18.0', 'angiogenesi_inhibitor', 'cloacogen_carcinoma', 'colon_neoplasm', 'cd29', 'dysplasia_inflammatori_bowel_diseas', 'serrat_polyposi', 'epcam', 'intestin_polyposi', 'rs1800469', 'cd44', 'mir-135b', 'g1n1317', 'rs34612342', 'rectal_cancer', 'ramucirumab', 'interstiti_brachytherapi', 'vegfa', 'tetraploid', 'msi', 'rx', 'fap', 'array-cgh', 'mir-92', 'irinotecan', 't4a-n2a-m0', 'adenomat_polyposi_syndrom', 'colon_cancer', 'radiofrequ_ablat', 'hereditari_nonpolyposi_type_5', 'r2', 'microrna_marker', 'mucos', 'ras-mapk', 'gardner_syndrom', 'neoadjuv_chemo', 'adjuv_chemo', 'doubl_contrast_barium_enema', 'mgmt', 'euploid', 'tingl', 'cyramza', 'monoclon_antibodi', 'c18.4', 'mlh1', 'mir-155', 'c18.6', 'ihc_msi_marker', 'barium_enema', 'hamartomat_polyposi_syndrom', 'msh6', 'd17s250', 'rs12603526', 'hereditari_nonpolyposi', 'pi3k', 'rtk', 'immun_checkpoint_inhibitor', 'pembrolizumab', 'transan_endoscop_microsurgeri', 'colorect_cancer', 'rs10911251', 'polymeras_proofreading-associ_polyposi', 'descend_colon_cancer', 'c18.5', 't4b-n0-m0', 'hepat_arteri_infus', 'molecular_marker_test', 'rs1799977', 'p16', '18q_ai_express', 'stereotact', 'anu_neoplasm', 'cd133', 'colon_kaposi_sarcoma', 'wnt', 'e1317q', 'rs3802842', 'tis-n0-m0', 'splenic_flexur_cancer', 'c18.7', 'turcot_syndrom', 'mir-21', 'rs4779584', 'adenosquam_colon_carcinoma', 'rs11169552', 'rs459552', 'rs3217810', 'rectal_bleed', 'braf_mutat', 't1-n0-m0', 'extern_beam', 'pms2_loss', 'blood_base', 'gardner_syndrom', 'attenu_adenomat_polyposi_coli', 'ptgs2', 't2-n0-m0', 'ploidi_statu', 'genom_instabl', 'bloodi_stool', 'hereditari_nonpolyposi_type_8', 'hereditari_nonpolyposi_type_6', 't1\xe2\x80\x93t2-n1/n1c-m0', 'cea', 'rs3824999', 'colon_lymphoma', 'ulcer_coliti', 'diseas_etiolog', 'g2', 'apoptot', 'ani_t_-ani_n-m1b', 'juvenil_polyposi_syndrom', 'rs1800734', 'microscopi', 'dmmr', 'r0', 'ng', 'desmoid_diseas', 'ctc', 'mir-211', 'rs12241008', 'g13d', 'rs961253', 'ag', 'hereditari_mix_polyposi_syndrom_2', 'dpyd', 'epigenet_gene_silenc', 'f594l', 'constip', 'cologuard', 'hereditari_colon_cancer', 't4b-n1\xe2\x80\x93n2-m0', 'r1', 'thrombocytopenia', 'dmmr_test', 'colon_sarcoma', 'rs174550', 'rectum_cancer', 't1\xe2\x80\x93t2-n2b-m0', 'd2s123', 'rs4444235', 'laparoscopi', 'cin_marker', 'kra_mutat_test', 'snp', 'liver_metastasi', 'prognosi', 'rs1321311', 'ct', 'aneuploid', 'g12v', 'kra', 'rs36053993', 'msi_test', 'hereditari_nonpolyposi_type_4', 'apc', 'timp-1', 'g4', 'p53_express', 'fda_approveddrug', 'g12', 'singl_specimen_guaiac_fobt', 'neuropathi', 'mlh1_loss', 'endocavitari', 'hereditari_nonpolyposi_type_1', 'braf_mutat_test', 'cea_assai', 'colorect_neoplasm', 'polyploidi_test', 'regorafenib', 'g1', 'dna_msi_marker', 'peutz-jegh_syndrom', 'adenomat_polyposi_coli', 'rs10411210', 'epcam', 'colectomi', 'prognost', 'autosom_recess_colorect_adenomat_polyposi', 'hereditari_nonpolyposi_type_3', 'rs158634', 'colon_l-cell_glucagon-lik_peptid_produc_tumor', 'c20', 'metastat_colorect_cancer', 'xeliri', 'hyperplast_polyposi_syndrom', 'bevacizumab', 'rectosigmoid_juction_cancer', 't2\xe2\x80\x93t3-n2a-m0', 'cd24', 'tumor_msi-h_express', 'colorect_adenocarcinoma', 'ani_t-_ani_n-m1a', 'virtual_colonoscopi', 'crohn&apos;_diseas', 'diploid', 
    #'t3\xe2\x80\x93t4a-n1/n1c-m0', 'pms2', 'folfiri-bevacizumab', 'rectal_neoplasm', 'braf', 'nrasmut', 'bat25', 'rs1042522', 'cin', 'sigmoid_colon_cancer', 'ascend_colon_cancer', 'radiat_therapi', 'krt20', 'bat26', 'apc_mutat', 'dre', 'colon_leiomysarcoma', 'ra_mutat_test', 'c19', 'lynch_syndrom', 'c18.9', 'tyrosin_kinas_inhibitor', 'ca_19-9', 'hmlh1', 'msh2_loss', 'rs4813802', 'colostomi', 'v600e', 'colon_singlet_ring_adenocarcinoma', 'alter_bowel_habit', 'xelox', 'stabl_diseas', 'rs12309274', 'hereditari_nonpolyposi_type_7', 'lung_metastasi', 'anal_canal_carcinoma', 'fu-lv', 'prognost_biomark', 'colon_small_cell_carcinoma', 'resect', 'rs647161', 'li-fraumeni_syndrom', 'q61k', 'rs10936599', 'rs7758229', 'hepat_flexur_cancer', 'proctectomi', 'msh2', 'dna_mismatch-repair', 'c18.2', 'mrt', 'cryosurgeri', 'pik3ca', 'hereditari_mix_polyposi_syndrom_1', 'oligodontia-colorect_cancer_syndrom', 'sept9_methyl', 'lonsurf', 'colonoscopi', 'adenoma', 'tgf-\xce\xb2', 'g12d', 'rs704017', 'faecal_m2-pk', 'polyploidi_test_result', 'msh6_loss', 'inherit_genet_disord', 'lgr5', 'kra_mutat', 'submucos_invasivecolon_adenocarcinoma', 'r_classif', 'rs9929218', 'sigmoidoscopi', 'mutyh-associ_polyposi', 'vegf', 't3\xe2\x80\x93t4a-n2b-m0', 'nonpolyposi_syndrom', 't1-n2a-m0', 'hyperthermia', 'high_fat_intak', 'popul_base_snp', 'mir-92a', 'cd166', 'anal_gland_neoplasm', 't4a-n0-m0', 'd5s346', 'rs10849432', 'rs61764370', 'rs1801155', 'plod1', 'c18.3', 'optic_colonoscopi', 'mir-31', 'rs16892766', 'rectosigmoid_cancer', 'panitumumab', 't3-n0-m0', 'mir-17', 'gx', 'fish', 'cognit_dysfunct', 'egfr', 'rs1801166', 'acut_myelocyt_leukemia', 'tym', 'folfox', 'lipomat_hemangiopericytoma', 'rs6691170', 'aldh1', 'mutyh', 'mss', 'attenu_famili_adenomat_polyposi', 'colon_adenocarcinoma', 'high_sensit_faecal_occult_blood_test', 'samson_gardner_syndrom', 'colon_mucin_adenocarcinoma', 'pmmr', 'tp53', 'g463v', 'capsul_colonoscopi', 'colon_squamou_cell_carcinoma', 'rectal_irrit', 'c18.1', 'hra', 'ceacam5', 'neodymium:yttrium-aluminum-garnet', 'cetuximab', 'folfiri', 'rs6983267', 'msi-l', 'c18']

    #NOTE new list (probably is the same as the old one)
    vocab_list = [
        'rs10795668', 'mir-135a', 'lynch_syndrom_i', 'biopsi', 'diseas',
        'c18.8', 'folfiri-cetuximab', 'rs4939827', 'iiib', 'colon_carcinoma',
        'outcom', 'transvers_colon_cancer', 'therapi_resist', 'ctnnb1', 'iiia',
        'rs1035209', 'famili_histori', 'relaps_free_surviv', 'p14',
        'anastomosi', 'cowden_syndrom', 'oxaliplatin', 'msi-h', 'bleed',
        'dna_imag_cytometri', 'capox', 'weight_loss', 'icd', 'endorect_mri',
        'aflibercept', 'argon', 'egf', 'immunotherapi', 'physic_activ',
        'rs4925386', 'c18.0', 'side_effect', 'diseas_subtyp',
        'angiogenesi_inhibitor', 'cloacogen_carcinoma', 'colon_neoplasm',
        'cd29', 'dysplasia_in_inflammatori_bowel_diseas', 'serrat_polyposi',
        'epcam', 'intestin_polyposi', 'rs1800469', 'cd44', 'mir-135b',
        'g1n1317', 'rs34612342', 'symptom', 'rectal_cancer', 'ramucirumab',
        'interstiti_brachytherapi', 'vegfa', 'tetraploid', 'msi', 'rx', 'fap',
        'array-cgh', 'mir-92', 'irinotecan', 't4a-n2a-m0',
        'adenomat_polyposi_syndrom', 'colon_cancer', 'radiofrequ_ablat',
        'hereditari_nonpolyposi_type_5', 'r2', 'microrna_marker', 'mucos',
        'ras-mapk', 'gardner_syndrom', 'gene', 'neoadjuv_chemo', 'iic',
        'adjuv_chemo', 'doubl_contrast_barium_enema', 'mgmt', 'smoke',
        'euploid', 'tingl', 'cyramza', 'monoclon_antibodi', 'vomit',
        'appetit_loss', 'nausea', 'c18.4', 'mlh1', 'mir-155', 'c18.6',
        'ihc_msi_marker', 'barium_enema', 'hamartomat_polyposi_syndrom',
        'msh6', 'respons', 'biomark', 'd17s250', 'rs12603526',
        'hereditari_nonpolyposi', 'alcohol', 'pi3k', 'rtk', 'nausea',
        'blood_disord', 'lack_of_physic_exercis', 'follow-up',
        'immun_checkpoint_inhibitor', 'pembrolizumab',
        'transan_endoscop_microsurgeri', 'weak', 'colorect_cancer',
        'rs10911251', 'polymeras_proofreading-associ_polyposi', 'iib',
        'dna_msi_test_result', 'molecular_featur', 'descend_colon_cancer',
        'c18.5', 't4b-n0-m0', 'hepat_arteri_infus', 'molecular_marker_test',
        'rs1799977', 'predict', 'p16', '18q_ai_express', 'stereotact',
        'anu_neoplasm', 'cd133', 'fever', 'ivb', 'good',
        'colon_kaposi_sarcoma', 'wnt', 'e1317q', 'rs3802842', 'weak_muscl',
        'tis-n0-m0', 'splenic_flexur_cancer', 'chemotherapi', 'target_therapi',
        'c18.7', 'turcot_syndrom', 'mir-21', 'rs4779584',
        'adenosquam_colon_carcinoma', 'pathwai', 'upset_stomach',
        'gender_male', 'rs11169552', 'surviv', 'rs459552', 'rs3217810',
        'intern', 'overal_surviv', 'rectal_bleed', 'braf_mutat', 't1-n0-m0',
        'extern_beam', 'pms2_loss', 'blood_base', 'gardner_syndrom',
        'attenu_adenomat_polyposi_coli', 'ptgs2', 't2-n0-m0', 'ploidi_statu',
        'genom_instabl', 'bloodi_stool', 'progress_diseas',
        'hereditari_nonpolyposi_type_8', 'nervou_system_effect', 'headach',
        'stomach_pain', 'five-year_surviv', 'local_excis', 'type',
        'hereditari_nonpolyposi_type_6', 'iii', 't1\xe2\x80\x93t2-n1/n1c-m0',
        'therapi', 'hair_loss', 'cea', 'chemotherapi_drug', 'rs3824999',
        'colon_lymphoma', 'recurr', 'ulcer_coliti', 'diseas_etiolog', 'g2',
        'apoptot', 'iiic', 'ani_t_-ani_n-m1b', '0', 'high_red_meat_diet',
        'juvenil_polyposi_syndrom', 'rs1800734', 'microscopi', 'dmmr', 'fit',
        'r0', 'mri', 'skin_irrit', 'leukopenia', 'ng', 'system',
        'desmoid_diseas', 'pole', 'ctc', 'mir-211', 'iia', 'rs12241008',
        'malign', 'g13d', 'rs961253', 'ag',
        'hereditari_mix_polyposi_syndrom_2', 'dpyd', 'epigenet_gene_silenc',
        'f594l', 'constip', 'cologuard', 'hereditari_colon_cancer',
        't4b-n1\xe2\x80\x93n2-m0', 'poor', 'obes', 'partial', 'region', 'r1',
        'thrombocytopenia', 'dmmr_test', 'colon_sarcoma', 'rs174550', 'peel',
        'rectum_cancer', 't1\xe2\x80\x93t2-n2b-m0', 'd2s123', 'rs4444235',
        'laparoscopi', 'cin_marker', 'loss_of_balanc', 'laser_therapi',
        'kra_mutat_test', 'snp', 'liver_metastasi', 'prognosi', 'rs1321311',
        'ct', 'aneuploid', 'g12v', 'kra', 'rs36053993', 'msi_test',
        'hereditari_nonpolyposi_type_4', 'apc', 'timp-1', 'g4', 'p53_express',
        'fda_approv_drug', 'g12', 'singl_specimen_guaiac_fobt', 'combin',
        'neuropathi', 'mlh1_loss', 'endocavitari', 'fungal_infect',
        'hereditari_nonpolyposi_type_1', 'braf_mutat_test', 'anemia',
        'cea_assai', 'colorect_neoplasm', 'polyploidi_test', 'regorafenib',
        'g1', 'dna_msi_marker', 'peutz-jegh_syndrom', 'adenomat_polyposi_coli',
        'rs10411210', 'epcam', 'colectomi', 'prognost',
        'autosom_recess_colorect_adenomat_polyposi',
        'hereditari_nonpolyposi_type_3', 'rs158634',
        'colon_l-cell_glucagon-lik_peptid_produc_tumor', 'c20',
        'metastat_colorect_cancer', 'xeliri', 'burn',
        'hyperplast_polyposi_syndrom', 'bevacizumab',
        'rectosigmoid_juction_cancer', 'european', 't2\xe2\x80\x93t3-n2a-m0',
        'carbon_dioxid', 'cd24', 'tumor_msi-h_express',
        'colorect_adenocarcinoma', 'ani_t-_ani_n-m1a', 'virtual_colonoscopi',
        'crohn&apos;_diseas', 'tender', 'diploid',
        't3\xe2\x80\x93t4a-n1/n1c-m0', 'pms2', 'muscl_pain',
        'folfiri-bevacizumab', 'rectal_neoplasm', 'predict_biomark', 'braf',
        'nra_mutat', 'bat25', 'pet', 'rs1042522', 'complet', 'cin',
        'sigmoid_colon_cancer', 'ascend_colon_cancer', 'radiat_therapi',
        'krt20', 'mouth_and_throat_sore', 'bat26', 'apc_mutat', 'dre',
        'colon_leiomysarcoma', 'fatigu', 'ra_mutat_test', 'c19', 'diagnosi',
        'shake', 'lynch_syndrom', 'c18.9', 'tyrosin_kinas_inhibitor',
        'risk_factor', 'ca_19-9', 'hmlh1', 'msh2_loss', 'rs4813802',
        'colostomi', 'screen', 'v600e', 'colon_singlet_ring_adenocarcinoma',
        'alter_bowel_habit', 'xelox', 'iva', 'ii', 'stabl_diseas',
        'rs12309274', 'i', 'hereditari_nonpolyposi_type_7', 'lung_metastasi',
        'anal_canal_carcinoma', 'fu-lv', 'prognost_biomark',
        'colon_small_cell_carcinoma', 'resect', 'rs647161',
        'li-fraumeni_syndrom', 'q61k', 'rs10936599', 'sexual_issu',
        'rs7758229', 'hepat_flexur_cancer', 'proctectomi', 'clinic_featur',
        'msh2', 'dna_mismatch-repair', 'c18.2', 'mrt', 'cryosurgeri', 'pik3ca',
        'hereditari_mix_polyposi_syndrom_1',
        'oligodontia-colorect_cancer_syndrom', 'sept9_methyl', 'fit',
        'lonsurf', 'exercis', 'pain', 'east_asian', 'colonoscopi', 'adenoma',
        'tgf-\xce\xb2', 'g12d', 'rs704017', 'surgeri', 'faecal_m2-pk',
        'polyploidi_test_result', 'msh6_loss', 'inherit_genet_disord', 'lgr5',
        'kra_mutat', 'submucos_invas_colon_adenocarcinoma', 'bmi', 'r_classif',
        'rs9929218', 'sigmoidoscopi', 'stem_cell', 'mutyh-associ_polyposi',
        '5-fu', 'vegf', 't3\xe2\x80\x93t4a-n2b-m0', 'nonpolyposi_syndrom',
        't1-n2a-m0', 'hyperthermia', 'high_fat_intak', 'type_of_care', 'g3',
        'popul_base_snp', 'alk', 'mir-92a', 'cd166', 'anal_gland_neoplasm',
        't4a-n0-m0', 'metastasi', 'd5s346', 'rs10849432', 'blister',
        'rs61764370', 'rs1801155', 'plod1', 'c18.3', 'optic_colonoscopi',
        'mir-31', 'rs16892766', 'iv', 'rectosigmoid_cancer', 'panitumumab',
        't3-n0-m0', 'mir-17', 'gx', 'fish', 'cognit_dysfunct', 'egfr',
        'rs1801166', 'prognost_factor', 'bladder_irrit',
        'acut_myelocyt_leukemia', 'tym', 'uicc_stage', 'folfox',
        'lipomat_hemangiopericytoma', 'rs6691170', 'aldh1', 'tumor_bud',
        'mutyh', 'mss', 'grade', 'attenu_famili_adenomat_polyposi',
        'colon_adenocarcinoma', 'high_sensit_faecal_occult_blood_test',
        'samson_gardner_syndrom', 'colon_mucin_adenocarcinoma', 'pmmr', 'tp53',
        'g463v', 'capsul_colonoscopi', 'colon_squamou_cell_carcinoma',
        'rectal_irrit', 'c18.1', 'hra', 'ceacam5',
        'neodymium:yttrium-aluminum-garnet', 'cetuximab', 'folfiri',
        'rs6983267', 'msi-l', 'c18'
    ]

    #NOTE only Fixed_Multi-Tag model has this update!
    vocab_list = [any2unicode(element) for element in vocab_list]
    dim = 200
    win = 8
    neg = 10

    kwargs = {
        "sent": contents,
        "vocab": vocab_list,
        "dim": dim,
        "win": win,
        "min_cnt": 2,
        "neg": neg,
        "iter": 20,
        "tag_doc": contents
    }
    Dis2Vec(**kwargs).run_Dis2Vec()