Code Example #1
File: test_parsing.py Project: yujuyeon0511/gensim
 def test_stem_text(self):
     target = \
         "while it is quit us to be abl to search a larg " + \
         "collect of document almost instantli for a joint occurr " + \
         "of a collect of exact words, for mani search purposes, " + \
         "a littl fuzzi would help."
     self.assertEqual(stem_text(doc5), target)
Code Example #2
 def testStemText(self):
     target = \
         "while it is quit us to be abl to search a larg " + \
         "collect of document almost instantli for a joint occurr " + \
         "of a collect of exact words, for mani search purposes, " + \
         "a littl fuzzi would help."
     self.assertEqual(stem_text(doc5), target)
Code Example #3
def funcao_limpa_tudo(artigo):  #cleans the word, limits the word to one
    lista_nova = []
    #logger.info('Setting it to 0, do not use it in your scoring function.')
    #print(docs)
    artigo = stem_text(artigo)
    artigo = split_alphanum(artigo)
    artigo = tokenizer.tokenize(
        artigo)  #artigo becomes a list of all the words

    list_artigo = list(artigo)  #look up what list() does
    try:
        for palavra in list_artigo:
            palavra = palavra.encode('utf-8')
            if re.match('^\d+$', palavra):
                #artigo.remove(palavra)
                pass
            elif palavra in stop_words:
                #artigo.remove(palavra)
                pass

            elif len(palavra) < 3:
                #artigo.remove(palavra)
                pass
            else:
                lista_nova.append(palavra)  #keep the useful word
    except Exception as erro:  #avoid crashing the code and continue,
        numero_palavra = artigo.index(palavra)
        artigo.pop(numero_palavra)  #found garbage

    del list_artigo  #let Python garbage-collect it
    #log.info("Lista de Tokens: %s", docs)
    if len(lista_nova) < 5:
        return None
    else:
        return lista_nova
Code Example #4
File: wordRank.py Project: meitiv/kaggle-covid19
 def __init__(self, text, section):
     self.text = text.strip()  # a string
     self.section = section.lower()
     self.tokens = [
         maybeLower(w) for w in stem_text(
             remove_stopwords(strip_punctuation2(self.text))).split()
     ]
     self.numTokens = len(self.tokens)
     self.weight = sectionWeights[self.section]
Code Example #5
def lemmatization_text(sentence,
                       allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """

	"""
    # break the sentence into list of tokens using gensim

    # form lemmatized words back to a sentence
    sentence = stem_text(sentence)

    return sentence
Code Example #6
def export(type_data='train'):
    print("Extracting data...")
    if type_data.lower() == 'train':
        filename = 'training.1600000.processed.noemoticon.csv'
    elif type_data.lower() == 'test':
        filename = 'testdata.manual.2009.06.14.csv'
    data_file = codecs.open('Sentiment140/' + filename, encoding='ISO-8859-1')
    data = []
    for tweet in data_file.read().split('\n')[:-1]:
        data.append(
            [string for string in tweet.split('"') if string not in ['', ',']])
    data_file.close()
    labels = [(float(tweet[0]) / 4.0) for tweet in data]
    tweets = [tweet[-1] for tweet in data]

    print("Preprocessing data...")
    for i, tweet in enumerate(tweets):
        new_tweet = ' '.join([word for word in tweet.split(' ') if len(word)\
                            > 0 and word[0] not in ['@', '#'] and 'http' not\
                            in word]).strip()
        pro_tweet = [
            word[:-3] if word[-3:] == 'xxx' else word
            for word in preprocess_string(new_tweet.replace('not', 'notxxx'))
        ]
        #pro_tweet = preprocess_string(new_tweet)
        if len(pro_tweet) < 2:
            tweets[i] = strip_punctuation(stem_text(new_tweet.lower())).\
                        strip().split()
        else:
            tweets[i] = pro_tweet
        sys.stdout.write("\r%d tweet(s) pre-processed out of %d\r" %
                         (i + 1, len(tweets)))
        sys.stdout.flush()

    print("\nCleaning data...")
    backup_tweets = np.array(tweets)
    backup_labels = np.array(labels)
    tweets = []
    labels = []
    for i, tweet in enumerate(backup_tweets):
        if len(tweet) >= 2:
            tweets.append(tweet)
            labels.append(backup_labels[i])
    del backup_tweets
    del backup_labels

    # Shuffle the dataset
    data = list(zip(tweets, labels))
    np.random.shuffle(data)
    tweets, labels = list(zip(*data))

    return (tweets, labels)
Code Example #7
def process_string(string, stemming=True, remove_stopwords=True):

    string = string.lower()
    abbreviations = re.findall(r'(?:[a-z]\.)+', string)
    for abbr in abbreviations:
        string = string.replace(abbr, abbr.replace('.', ''))
    string = pproc.strip_punctuation(string)
    if remove_stopwords:
        string = pproc.remove_stopwords(string)
    if stemming:
        string = pproc.stem_text(string)
    string = string.strip()
    return string
Code Example #8
def lemmatization_text(sentence,
                       allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """
	function to lemmatize text
	Args: sentence (str) - sentence to be lemmatized
	      allowed_postags (list) - allowed words postags
	Returns: sentence (str) - lemmatized sentence
	"""
    # break the sentence into list of tokens using gensim
    # list_tokens = gensim.utils.simple_preprocess(str(sentence))
    sentence = stem_text(sentence)

    return sentence
Code Example #9
def basic_preprocessing(
        text,
        sents=False,
        lower=False,
        stem=False,
        min_token_len=3,
        min_sent_len=4,
        remove_stops=False,
        stops=STOPWORDS,
        filters=['strip_multiple_whitespaces', 'strip_punctuation']):
    # EDT export specific
    text = text.replace('\x00', '')
    text = text.replace('\r\n', '\n')

    # note: filters will be applied in order
    if sents:
        sents = get_sentences(text)
    else:
        sents = [text]

    for s in sents:
        s = s.strip()
        if lower:
            s = s.lower()
        if stem:
            s = stem_text(s)

        for f in filters:
            s = funcs[f](s)

        # naive word tokenization
        s = s.split()
        tmp = list()
        for t in s:
            t = t.strip()
            if t:
                if remove_stops and stops:
                    if t not in stops:
                        tmp.append(t)
                    else:
                        continue
                else:
                    tmp.append(t)
            else:
                continue
        s = tmp

        if len(s) < min_sent_len:
            yield list()
        else:
            yield s
Code Example #10
    def get(self):
        args = parser.parse_args()
        query = stem_text(strip_punctuation2(remove_stopwords(
            args['query']))).split()
        results = []
        for paperID, paper in topPapers(query, 10):
            result = {}
            result['title'] = paper['metadata']['title']
            result['doi'] = paper['doi']
            result['sentences'] = [
                s['text'] for s in topSentences(paperID, query, 10)
            ]
            results.append(result)

        return results
Code Example #11
def save_word_dict(text):
    proc_text = []

    sentences = text
    sentences = tokenize.sent_tokenize(sentences)

    for sentence in sentences:
        sentence_without_stops = remove_stopwords(sentence)
        sentence_without_stops = stem_text(sentence_without_stops)
        sentence_without_stops = strip_short(sentence_without_stops)
        sentence_without_stops = strip_punctuation(sentence_without_stops)

        proc_sentence = word_tokenize(sentence_without_stops.lower())

        if len(proc_sentence) == 0:
            continue
        proc_text.append(proc_sentence)

    dictionary = corpora.Dictionary(proc_text)
    return [dictionary, proc_text, sentences]
Code Example #12
def save_word_dict(text):
    proc_text = []

    sentences = text
    sentences = tokenize.sent_tokenize(sentences)

    for sentence in sentences:
        sentence_without_stops = remove_stopwords(sentence)
        sentence_without_stops = stem_text(sentence_without_stops)
        sentence_without_stops = strip_short(sentence_without_stops)
        sentence_without_stops = strip_punctuation(sentence_without_stops)

        proc_sentence = word_tokenize(sentence_without_stops.lower())

        if (len(proc_sentence) == 0):
            continue
        proc_text.append(proc_sentence)

    dictionary = corpora.Dictionary(proc_text)
    return [dictionary, proc_text, sentences]
Code Example #13
def funcao_limpa_tudo(artigo):
    tokenizer = RegexpTokenizer(r'\w+')
    lista_nova = []
    #logger.info('Setting it to 0, do not use it in your scoring function.')
    #print(docs)
    artigo = stem_text(artigo)
    artigo = split_alphanum(artigo)
    artigo = tokenizer.tokenize(artigo)

    list_artigo = list(artigo)
    try:
        for palavra in list_artigo:
            palavra = palavra.encode('utf-8')
            if re.match('^\d+$', palavra):
                #artigo.remove(palavra)
                pass
            elif palavra in pt_stop:
                #artigo.remove(palavra)
                pass

            elif len(palavra) < 3:
                #artigo.remove(palavra)
                pass
            else:
                lista_nova.append(palavra)
    except Exception as erro:
        print(erro)
        #numero_palavra = artigo.index(palavra)
        #artigo.pop(numero_palavra) #found garbage
        pass
    del list_artigo
    del artigo
    #log.info("Lista de Tokens: %s", docs)
    if len(lista_nova) < 5:
        return None
    else:
        return lista_nova
Code Example #14
    def prep_text_eng(self, text):
        # TODO optimalize
        res = preprocessing.strip_punctuation(text.lower())
        if self.settings['strip_nums']:
            res = preprocessing.strip_numeric(res)

        if self.settings[
                'use_lemmatizer']:  #TODO careful with using lemmatizer before removing stop words (performance)
            #res = " ".join([self.lemma.lemmatize(word, self.get_wordnet_pos(word)) for word in res.split() if len(word) > 2])
            res = " ".join(
                [self.lemma.lemmatize(word) for word in res.split()])

        if self.settings['remove_stop_words']:
            res = preprocessing.remove_stopwords(res)
            res = " ".join(word for word in res.split()
                           if word not in stp_wrds)

        if self.settings['strip_short']:
            res = preprocessing.strip_short(res, minsize=3)

        if self.settings['use_stemmer']:
            res = preprocessing.stem_text(res)
        # normalized = " ".join(lemma.lemmatize(word) for word in res.split())
        return res
Code Example #15
    def __iter__(self):

        with utils.smart_open(self.source) as fin:
            for idx, line in enumerate(itertools.islice(fin, self.limit)):
                print idx,
                print "\r",

                #line = re.sub(r"(?<=\w[^\d])\.|\.(?=[^\d])|\(|\)|\[|\]|,(?= )|((?<=[^\w])-|-(?=[^\w]))|'","",line)

                #Split words if they get stuffed together
                #line = re.sub( r"([A-Z][A-Z]*)", r" \1", line)

                #line = re.sub(r"\.(?=[^\d])|(?<=\w|\d)\(|(?<=\w|\d)\)"," ",line)
                #line = re.sub(r"(?<=\w[^\d])\.|\(|\)|\[|\]|,(?= )|((?<=[^\w])-|-(?=[^\w]))|'","",line)

                #NOTE split() will take care of extra spaces
                line = re.sub(
                    r"(?<=\w[^\d])\.|\.(?=[^\d])|\(|\)|\[|\]|,(?= )|((?<=[^\w])-|-(?=[^\w]))|:|\?|\;",
                    " ", line)

                #NOTE convert ages to numeric

                try:
                    line = re.sub(r"([a-z]+ (?=year-old))", fix_age, line)
                except:
                    pass

                line = remove_stopwords(line)
                #doc = filter(lambda word: word not in stopwords.words('english'),line.split(" ") )
                line = stem_text(line)
                line = line.split()
                line = self.trigram[self.bigram[line]]
                i = 0
                while i < len(line):
                    yield line[i:i + self.max_sentence_length]
                    i += self.max_sentence_length
Code Example #16
    def __iter__(self):
        with utils.smart_open(self.source) as fin:
            for item_no, line in enumerate(fin):

                #line = re.sub(r"(?<=\w[^\d])\.|\.(?=[^\d])|\(|\)|\[|\]|,(?= )|((?<=[^\w])-|-(?=[^\w]))|'","",line)

                #Split words if they get stuffed together
                #line = re.sub( r"([A-Z][A-Z]*)", r" \1", line)

                #line = re.sub(r"\.(?=[^\d])|(?<=\w|\d)\(|(?<=\w|\d)\)"," ",line)
                #line = re.sub(r"(?<=\w[^\d])\.|\(|\)|\[|\]|,(?= )|((?<=[^\w])-|-(?=[^\w]))|'","",line)

                label = " ".join(line.split(" ")[:10])

                #NOTE split() will take care of extra spaces
                line = re.sub(
                    r"(?<=\w[^\d])\.|\.(?=[^\d])|\(|\)|\[|\]|,(?= )|((?<=[^\w])-|-(?=[^\w]))|:|\?|;|>|<|=|%|\+|-",
                    " ", line)

                #NOTE convert ages to numeric
                '''    
                try:
                    line = re.sub(r"([a-z]+ (?=year-old))",fix_age,line)
                except:
                    pass
                '''
                line = remove_stopwords(line)

                line = stem_text(line).split()

                #Dropping the stop word calculation, Since phrases were calculated without it
                doc = line
                doc = self.trigram[self.bigram[doc]]

                #doc = [element.lower() for element in doc]
                yield TaggedDocument(doc, [label])
Code Example #17
from bs4 import BeautifulSoup
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from gensim.parsing.preprocessing import remove_stopwords, stem_text

soup = BeautifulSoup(open("Question1.txt"), 'html.parser')
for junk in soup(["docno", "date", "author", "favorite"]):
    junk.decompose()
soup.prettify()
data = str(soup.get_text()).splitlines()
preprocessed_data = []
i = 0
for line in data:
    print(i)
    i += 1
    if line != '':
        sline = line.split("\t", 1)
        if len(sline) == 2:
            sline[1] = stem_text(remove_stopwords(sline[1]))
            preprocessed_data.append(simple_preprocess(sline[1]))
        else:
            sline[0] = stem_text(remove_stopwords(sline[0]))
            preprocessed_data.append(simple_preprocess(sline[0]))
model = Word2Vec(preprocessed_data, size=300, window=5, min_count=3, workers=8)
print(model.wv.most_similar(stem_text("amazed")))
Code Example #18
#print keywords(query_doc,words=8)

#query_doc = remove_stopwords(query_doc)
#query_doc = stem_text(query_doc).split()

bigram = Phraser.load('./preprocessed_big_phrases')
trigram = Phraser.load('./preprocessed_trigram_phrases')

col_1 = "The patient was a 47 year-old man, whose chief complaint was melena. He visited a nearby hospital, and further evaluation showed rectal cancer invading the prostate, with multiple lung and liver metastases. The clinical diagnosis was cT4b(prostate), cN1, cM1b(H2, PUL2), cStage IV . We performed colostomy in the transverse colon prior to chemotherapy. He was administered 1 course of mFOLFOX6 plus bevacizumab and 7 courses of FOLFOXIRI plus bevacizumab. The primary tumor showed PR. The liver metastases were localized and shrunken, while the lung metastases disappeared. Approximately 6 months after the start of chemotherapy, a laparoscopic total pelvic exenteration and ileal conduit were performed following the diagnosis of ycT4b(prostate), ycN1, ycM1a(H2), ycStage IV . About 3 months later, a partial resection of the left liver lobes(S1 and S5/S8)was performed laparoscopically. He has been cancer-free for 8 months."
col_1 = re.sub(
    r"(?<=\w[^\d])\.|\.(?=[^\d])|\(|\)|\[|\]|,(?= )|((?<=[^\w])-|-(?=[^\w]))|:|\?|\;",
    " ", col_1)

col_1 = remove_stopwords(col_1)
col_1 = stem_text(col_1).split()

col_1 = trigram[bigram[col_1]]

#print col_1
#inferred_docvec = model.infer_vector(trigram[bigram[new_q]],steps=5000)

col_2 = "Combined null modality therapy is sufficient to treat advanced rectal cancer with multiple metastases. Her, we report a case of long-term survival in a patient with multiple metastases from rectal cancer. A5 8-year-old man had previously undergone low anterior resection for advanced rectal cancer. Multiple liver and lung metastases were identified prior to operation; therefore, we initiated null chemotherapy (FOLFOX). Partial resection of metastatic lesions and radiofrequency ablation(RFA)were also administered, but newly developed liver, lung, and adrenal gland metastases were identified. We changed the chemotherapy null regimen and administered topical therapies(partial resection, RFA, hepatic arterial infusion null chemotherapy, null radiotherapy)for each chemotherapy refractory metastatic lesion. Although the patient is in a tumor bearing state, he is still alive 10 years after his first operation. This combined modality therapy is an option for patients with chemotherapy null refractory metastases from rectal cancer."
col_2 = re.sub(
    r"(?<=\w[^\d])\.|\.(?=[^\d])|\(|\)|\[|\]|,(?= )|((?<=[^\w])-|-(?=[^\w]))|:|\?|\;",
    " ", col_2)

col_2 = remove_stopwords(col_2)
col_2 = stem_text(col_2).split()

col_2 = trigram[bigram[col_2]]
Code Example #19
def stem_words(inStr):
	"""Stems words in text using gensim."""
	return stem_text(inStr)
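
A quick usage sketch (illustrative only, not part of the source project; the expected output follows the stemmed test target shown in Code Example #1):

print(stem_words("search a large collection of documents"))
# -> search a larg collect of document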
Code Example #20
def _stem(doc):
    doc = stem_text(doc)
    return doc
Code Example #21
# additional imports required by the snippet below (pandas, bs4, scikit-learn)
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from gensim.parsing.preprocessing import stem_text, remove_stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
bagOfWords = CountVectorizer()
tfIdfBow = TfidfVectorizer()
data = pd.read_csv("Question2 Dataset.tsv",
                   usecols=['sentiment', 'review'],
                   delimiter='\t',
                   dtype={
                       'sentiment': int,
                       'review': str
                   })

texts = data['review'].apply(
    lambda x: BeautifulSoup(x, 'html.parser').get_text())
labels = data['sentiment']
texts = texts.apply(lambda x: stem_text(remove_stopwords(x)))
bowVectors = bagOfWords.fit_transform(texts)
tfIdfVectors = tfIdfBow.fit_transform(texts)
traindata, testdata, trainlabel, testlabel = train_test_split(tfIdfVectors,
                                                              labels,
                                                              test_size=0.2)
traindata2, testdata2, trainlabel2, testlabel2 = train_test_split(
    bowVectors, labels, test_size=0.2)
model = MultinomialNB()
model.fit(traindata, trainlabel)
model2 = MultinomialNB()
model2.fit(traindata2, trainlabel2)
predictions = model.predict(testdata)
predictions2 = model2.predict(testdata2)
print("Accuracy TF-IDF: ", np.mean(predictions == testlabel) * 100)
print("Accuracy Bag of Words: ", np.mean(predictions2 == testlabel2) * 100)
Code Example #22
    print("{0}: {1}".format(*x))

texts = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]

# After removing stop words in their original form, we can convert all words into tokenized representations
# Optional step! This might work better without this step
ALLOW_STEMMED_REPR = False

if ALLOW_STEMMED_REPR:
    for i in range(len(texts)):
        for j in range(len(texts[i])):
            word = texts[i][j]
            word = stem_text(word) # do we want "porter-stemmed version" ?
            texts[i][j] = word

"""
DBG_freq_between = (10,40) # <a,b>
for term, freq in frequency.items():
    if freq >= DBG_freq_between[0] and freq <= DBG_freq_between[1]:
        print(freq, term)
"""

print(len(texts), "documents")
print("EXAMPLES: ")

for i in range(3):
    print(len(texts[i]), texts[i])
Code Example #23
def preprocessing():
    for doc in corpus:
        doc_new = strip_numeric(stem_text(doc))
        yield gensim.utils.tokenize(doc_new, lower=True)
Code Example #24
# initialize stemmer
#stemmer = snowballstemmer.EnglishStemmer()

# grab stopword list, extend it a bit, and then turn it into a set for later
#stop = stopwords.words('english')
#stop.extend(['may','also','zero','one','two','three','four','five','six','seven','eight','nine','ten','across','among','beside','however','yet','within']+list(ascii_lowercase))
#stoplist = stemmer.stemWords(stop)
#stoplist = set(stoplist)
#stop = set(sorted(stop + list(stoplist)))

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

#print(list(lemmatize("Hello World! record records going goes browser browsers browsing How is it going?! Nonexistentword, 21"  ) ))

wl = print(stem_text("escalate exfiltrate application"))

# remove characters and stoplist words, then generate dictionary of unique words
#data['text_data'].replace('[!"#%\'()*+,-./:;<=>?@\[\]^_`{|}~1234567890’”“′‘\\\]',' ',inplace=True,regex=True)
#wordlist = filter(None, " ".join(list(set(list(itertools.chain(*data['text_data'].str.split(' ')))))).split(" "))
#data['stemmed_text_data'] = [' '.join(filter(None,filter(lambda word: word not in stop, line))) for line in data['text_data'].str.lower().str.split(' ')]

#data.replace('[!"#%\'()*+,-./:;<=>?@\[\]^_`{|}~1234567890’”“′‘\\\]',' ',inplace=True,regex=True)
#lemmatizer = WordNetLemmatizer()

file = open("C:\Ankit\output\output.txt", "w")

for line in data:
    line = strip_punctuation(strip_non_alphanum(strip_numeric(line.lower())))
    file.write(str(line.encode("utf-8")))
    #sentences.append(line.split())
Code Example #25
# In[32]:

#Part E
get_ipython().run_line_magic('pinfo', 'stem_text')
#Transform `s` into lowercase and stem it
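# Hedged illustration (not part of the original notebook): based on the test
# target in Code Example #1, stem_text lowercases and Porter-stems each word,
# e.g. stem_text("collection of documents almost instantly")
#      -> "collect of document almost instantli"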

# In[33]:

df['content_stem'] = df['content2']

# In[34]:

#apply stem_text
for i in range(0, len(df['content'])):
    regex = stem_text((str(df['content2'][i])))
    df['content_stem'][i] = regex

# In[35]:

#test
df['content_stem'][0]

# In[36]:

#Initializing spaCy's 'en_core_web_sm' model
import spacy
sp = spacy.load('en_core_web_sm')

# In[37]:
Code Example #26
def pre_processing():
    for document in texts:
        doc = strip_numeric(stem_text(document))
        yield gensim.utils.tokenize(doc, lower=True)
Code Example #27
File: joker.py Project: sergeyleepython/telegrambot
def preprocess_sentence(sentence):
    sentence = preprocessing.strip_punctuation(sentence)
    sentence = preprocessing.stem_text(sentence)
    sentence = preprocessing.remove_stopwords(sentence)
    sentence = sentence.split()
    return sentence
Code Example #28
File: regex_tester.py Project: Abas-Khan/thesis
from gensim.parsing.preprocessing import stem_text

'''
stng ="this is 2.3. and this is just.and ?"
exp = "Modification of arginine and lysine in proteins with 2,4-pentanedione.Primary amines react 2,4-pentanedione pH 6-9 form enamines, N-alkyl-4-amino-3-penten-2-ones. The latter compounds readily regenerate primary amine low pH treatment hydroxylamine. Guanidine substituted guanidines react 2,4-pentanedione form N-substituted 2-amino-4,6-dimethylpyrimidines rate lower least factor 20 rate reaction 2,4-pentanedione primary amines. Selective modification lysine arginine side chains proteins readily achieved 2,4-pentanedione. Modification lysine favored reaction pH 7 short reaction times pH 9. Selective modification arginine achieved reaction 2,4-pentanedione long times pH 9, followed treatment protein hydroxylamine. The extent modification lysine arginine side chains readily measured spectrophotometrically. Modification lysozyme 2,4-pentanedione pH 7 results modification 3.8 lysine residues less 0.4 arginine residue 24 hr. Modification lysozyme 2,4-pentanedione pH 9 results modification 4 lysine residues 4.5 arginine residues 100 hr. Treatment modified protein hydroxylamine regenerated modified lysine residues caused change modified arginine residues. One arginine residue seems essential catalytic activity enzyme."
content = re.sub(r"(?<=\w[^\d])\.|\.(?=[^\d])|\(|\)|\[|\]|,(?= )","",exp)
print content
'''

vocab_list = ['rs10795668', 'miR-135a', 'Lynch syndrome I', 'biopsy', 'disease', 'C18.8', 'FOLFIRI-CETUXIMAB', 'rs4939827', 'IIIB', 'colon carcinoma', 'outcome', 'transverse colon cancer', 'therapy resistance', 'CTNNB1', 'IIIA', 'rs1035209', 'family history', 'relapse free survival', 'p14', 'anastomosis', 'Cowden syndrome', 'oxaliplatin', 'MSI-H', 'bleeding', 'DNA Image Cytometry', 'CAPOX', 'weight loss', 'ICD', 'Endorectal MRI', 'aflibercept', 'argon', 'EGF', 'immunotherapy', 'physical activity', 'rs4925386', 'C18.0', 'side effects', 'disease subtypes', 'angiogenesis inhibitors', 'cloacogenic carcinoma', 'colonic neoplasms', 'CD29', 'dysplasia in inflammatory bowel disease', 'serrated polyposis', 'EpCAM', 'intestinal polyposis', 'rs1800469', 'CD44', 'miR-135b', 'G1n1317', 'rs34612342', 'symptoms', 'rectal cancer', 'ramucirumab', 'interstitial brachytherapy', 'VEGFA', 'tetraploid', 'MSI', 'RX', 'FAP', 'Array-CGH', 'miR-92', 'irinotecan', 'T4a-N2a-M0', 'adenomatous polyposis syndromes', 'colon cancer', 'radiofrequency ablation', 'hereditary nonpolyposis type 5', 'R2', 'microRNA markers', 'mucositis', 'RAS-MAPK', 'gardner syndrome', 'genes', 'neoadjuvant chemo', 'IIC', 'adjuvant chemo', 'double contrast barium enema', 'MGMT', 'smoking', 'euploid', 'tingling', 'cyramza', 'monoclonal antibodies', 'vomiting', 'appetite loss', 'nausea', 'C18.4', 'MLH1', 'miR-155', 'C18.6', 'IHC MSI markers', 'barium enema', 'hamartomatous polyposis syndromes', 'MSH6', 'response', 'biomarkers', 'D17S250', 'rs12603526', 'hereditary nonpolyposis', 'alcohol', 'PI3K', 'RTK', 'nausea', 'blood disorders', 'lack of physical exercise', 'follow-up', 'immune checkpoint inhibitors', 'pembrolizumab', 'transanal endoscopic microsurgery', 'weakness', 'colorectal cancer', 'rs10911251', 'polymerase proofreading-associated polyposis', 'IIB','DNA MSI test results', 'molecular features', 'descending  colon cancer', 'C18.5', 'T4b-N0-M0', 'hepatic artery infusion', 'molecular marker testing', 'rs1799977', 'predictive', 'p16', '18q AI expression', 'stereotactic', 'anus neoplasms', 'CD133', 'fever', 'IVB', 'good', 'colon Kaposi sarcoma', 'WNT', 'E1317Q', 'rs3802842', 'weak muscle', 'Tis-N0-M0', 'splenic flexure cancer', 'chemotherapy', 'targeted therapy', 'C18.7', 'Turcot syndrome', 'miR-21', 'rs4779584', 'adenosquamous colon carcinoma', 'pathways', 'upsetstomach', 'gender male', 'rs11169552', 'survival', 'rs459552', 'rs3217810', 'internal', 'overall survival', 'rectal bleeding', 'BRAF mutation', 'T1-N0-M0', 'external beam', 'PMS2 loss', 'blood based', 'Gardner syndrome', 'attenuated adenomatous polyposis coli', 'PTGS2', 'T2-N0-M0', 'ploidy status', 'genomic instability', 'bloody stools', 'progressive disease', 'hereditary nonpolyposis type 8', 'nervous system effects', 'headaches', 'stomach pain', 'five-year survival', 'local excision', 'types', 'hereditary nonpolyposis type 6', 'III', 'T1\xe2\x80\x93T2-N1/N1c-M0', 'therapy', 'hair loss', 'CEA', 'chemotherapy drugs', 'rs3824999', 'colon lymphoma', 'recurrence', 'ulcerative colitis', 'disease etiology', 'G2', 'apoptotic', 'IIIC', 'Any T -Any N-M1b', '0', 'high red meat diet', 'Juvenile polyposis syndrome', 'rs1800734', 'microscopy', 'dMMR', 'fitness', 'R0', 'MRI', 'skin irritation', 'leukopenia', 'NGS', 'systemic', 'desmoid disease', 'POLE', 'CTC', 'miR-211', 'IIA', 'rs12241008', 'malignancy', 'G13D', 'rs961253', 'age', 'hereditary mixed polyposis syndrome 2', 'DPYD', 'Epigenetic gene silencing', 'F594L', 'constipation', 'cologuard', 'hereditary colon cancer', 
'T4b-N1\xe2\x80\x93N2-M0', 'poor', 'obesity', 'partial', 'regional', 'R1', 'thrombocytopenia', 'dMMR test', 'colon sarcoma', 'rs174550', 'peeling', 'rectum cancer', 'T1\xe2\x80\x93T2-N2b-M0', 'D2S123', 'rs4444235', 'laparoscopy', 'CIN markers', 'loss of balance', 'laser therapy', 'KRAS mutational testing', 'SNPs', 'liver metastasis', 'prognosis', 'rs1321311', 'CT', 'aneuploid', 'G12V', 'KRAS', 'rs36053993', 'MSI test', 'hereditary nonpolyposis type 4', 'APC', 'TIMP-1', 'G4', 'p53 expression', 'FDA approveddrugs', 'G12S', 'single specimen guaiac FOBT', 'combinations', 'neuropathy', 'MLH1 loss', 'endocavitary', 'fungal infection', 'hereditary nonpolyposis type 1', 'BRAF mutation test', 'anemia', 'CEA assay', 'colorectal neoplasms', 'polyploidy test', 'regorafenib', 'G1', 'DNA MSI markers', 'Peutz-Jeghers syndrome', 'adenomatous polyposis coli', 'rs10411210', 'EPCAM', 'colectomy', 'prognostic', 'autosomal recessive colorectal adenomatous polyposis', 'hereditary nonpolyposis type 3', 'rs158634', 'colonic L-cell glucagon-like peptide producing tumor', 'C20', 'metastatic colorectal cancer', 'XELIRI', 'burning', 'Hyperplastic Polyposis Syndrome', 'bevacizumab', 'rectosigmoid juction cancer', 'european', 'T2\xe2\x80\x93T3-N2a-M0', 'carbon dioxide', 'CD24', 'tumor MSI-H expression', 'colorectal adenocarcinoma', 'Any T- Any N-M1a', 'virtual colonoscopy', 'Crohn&apos;s disease', 'tenderness', 'diploid', 'T3\xe2\x80\x93T4a-N1/N1c-M0', 'PMS2', 'muscle pain', 'FOLFIRI-BEVACIZUMAB', 'rectal neoplasms', 'predictive biomarker', 'BRAF', 'NRASmutation', 'BAT25', 'PET', 'rs1042522', 'complete', 'CIN', 'sigmoid colon cancer', 'ascending colon cancer', 'radiation therapy','KRT20', 'mouth and throat sores', 'BAT26', 'APC mutations', 'DRE', 'colon leiomysarcoma', 'fatigue', 'RAS mutation test', 'C19','diagnosis', 'shaking', 'Lynch syndrome', 'C18.9', 'tyrosine kinase inhibitors', 'risk factors', 'CA 19-9', 'hMLH1', 'MSH2 loss','rs4813802', 'colostomy', 'screening', 'V600E', 'colon singlet ring adenocarcinoma', 'altered bowel habits', 'XELOX', 'IVA', 'II', 'stable disease', 'rs12309274', 'I', 'hereditary nonpolyposis type 7', 'lung metastasis', 'anal canal carcinoma', 'FU-LV', 'prognostic biomarker', 'colon small cell carcinoma', 'resectability', 'rs647161', 'Li-Fraumeni syndrome', 'Q61K', 'rs10936599', 'sexual issues', 'rs7758229', 'hepatic flexure cancer', 'proctectomy', 'clinical features', 'MSH2', 'DNA mismatch-repair', 'C18.2', 'MRT', 'cryosurgery', 'PIK3CA', 'hereditary mixed polyposis syndrome 1', 'oligodontia-colorectal cancer syndrome', 'SEPT9 methylation', 'FIT', 'lonsurf', 'exercise', 'pain', 'east asian', 'colonoscopy', 'adenomas', 'TGF-\xce\xb2', 'G12D', 'rs704017', 'surgery', 'Faecal M2-PK', 'polyploidy test results', 'MSH6 loss', 'inherited genetic disorders', 'Lgr5', 'KRAS mutation', 'submucosal invasivecolon adenocarcinoma', 'BMI', 'R classification', 'rs9929218', 'sigmoidoscopy', 'stem cell', 'MUTYH-associated polyposis', '5-FU', 'VEGF', 'T3\xe2\x80\x93T4a-N2b-M0', 'nonpolyposis syndrome', 'T1-N2a-M0', 'hyperthermia', 'high fat intake', 'type of care', 'G3', 'population based SNP', 'ALK', 'miR-92a', 'CD166', 'anal gland neoplasms', 'T4a-N0-M0', 'metastasis', 'D5S346', 'rs10849432', 'blistering', 'rs61764370', 'rs1801155', 'PLOD1', 'C18.3', 'optical colonoscopy', 'miR-31', 'rs16892766', 'IV', 'rectosigmoid cancer', 'panitumumab', 'T3-N0-M0', 'miR-17', 'GX', 'FISH', 'cognitive dysfunction', 'EGFR', 'rs1801166', 'prognostic factors', 'bladder irritation', 'acute myelocytic leukemia', 'TYMS',
'UICC staging', 'FOLFOX', 'lipomatous hemangiopericytoma', 'rs6691170', 'ALDH1', 'tumor budding', 'MUTYH', 'MSS', 'grade', 'attenuated familial adenomatous polyposis', 'colon adenocarcinoma', 'high sensitivity faecal occult blood test', 'Samson Gardner syndrome', 'colon mucinous adenocarcinoma', 'pMMR', 'TP53', 'G463V', 'capsule colonoscopy', 'colon squamous cell carcinoma', 'rectal irritation', 'C18.1', 'HRAS', 'CEACAM5', 'neodymium:yttrium-aluminum-garnet', 'cetuximab', 'FOLFIRI', 'rs6983267', 'MSI-L', 'C18']
bigrams = []
trigrams = []

new_vocab = []
for item in vocab_list:
    item = stem_text(item)
    new_vocab.append(item.replace(" ","_"))

print new_vocab    
'''
for item in vocab_list:
    print "Stemming the results"
    item = stem_text(item)
    #print item
    content = item.split(" ")
    if len(content) ==2:
        bigrams.append(content)
    elif len(content) ==3:
        item = item.replace(" ","_",1)
        trigrams.append(item.split(" "))
print trigrams
Code Example #29
File: __init__.py Project: IKMLab/pan2020
def tokenize(tweet, stem=False):
    tweet = tweet.lower()
    tweet = preprocessing.strip_punctuation(tweet)
    if stem:
        tweet = preprocessing.stem_text(tweet)
    return nltk.word_tokenize(tweet)