Example no. 1
def informationgaincompare(doc, text1, text2):
    text1a = tokenize(text1)
    text2a = tokenize(text2)
    t1 = []
    t2 = []
    punctpattern = re.compile(r'[,;\'"\)\(}{\[\].!\?<>=+-/*\\:]+')
    for i in text1a:
        if i in stopwords.words('english') or punctpattern.match(i) != None:
            continue
        t1.append(i)
    for i in text2a:
        if i in stopwords.words('english') or punctpattern.match(i) != None:
            continue
        t2.append(i)
    doctokens = tokenize(doc)
    docwords = []
    for i in doctokens:
        if i in stopwords.words('english') or punctpattern.match(i) != None:
            continue
        docwords.append(i)
    count1 = 0
    for i in t1:
        count1 += docwords.count(i)
    count2 = 0
    for i in t2:
        count2 += docwords.count(i)
    l = len(docwords)
    p1 = float(count1)/l
    p2 = float(count2)/l
    return (-p1*math.log(p1), -p2*math.log(p2))
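Note (not part of the original snippet): math.log(p) raises ValueError whenever one of the comparison texts never occurs in the document (p == 0), and stopwords.words('english') is rebuilt for every token. A minimal sketch of a zero-safe entropy term with the stopword list cached once:

import math
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))  # build once, reuse across all loops

def entropy_term(count, total):
    """Return -p*log(p), treating p == 0 as contributing 0."""
    p = count / total if total else 0.0
    return -p * math.log(p) if p > 0 else 0.0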
Example no. 2
    def palavrasChaves(self):
        # NLTK function that returns the stopwords for English
        stopE = stopwords.words('english')

        # NLTK function that returns the stopwords for Portuguese
        stop = stopwords.words('portuguese')  
              
        stopS = stopwords.words('spanish')
        
        palavrasChaves = [] 
        textoArtigo = []
        
        # strip punctuation from the text and split it into words
        for i in self.titulo.lower().replace(',','').replace('.','').replace('-','').replace('(','').replace(')','').split():
            # drop Portuguese stopwords from the text of the article being presented
            if i not in stop:
                # drop English stopwords from the text of the article being presented
                if i not in stopE:
                    # ignore words with fewer than 3 characters; this handles words such as the verb "é"
                    if i not in stopS:
                            if len(i) > 2:
                                textoArtigo.append(i)
        
        # frequency of word repetitions in the body of the article
        freq = FreqDist(textoArtigo)
        
        # take the four most frequent words
        items = freq.items()[:4]
        
        # put the most frequent words of the text into the palavrasChaves variable
        for i in range(0,len(items)):
            palavrasChaves.append(items[i][0])
            
        return palavrasChaves        
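A caveat on the block above: freq.items()[:4] relies on the NLTK 2 / Python 2 behaviour where FreqDist.items() returns a list already sorted by frequency. Under NLTK 3 (Python 3) items() is a plain dict view, neither sorted by count nor sliceable; the equivalent, as a sketch reusing the textoArtigo list built above:

from nltk.probability import FreqDist

freq = FreqDist(textoArtigo)
palavrasChaves = [w for w, _ in freq.most_common(4)]  # four most frequent words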
Example no. 3
def loadQueries(fileloc):
    setTags=set()
    global training_doc_count
    global set_of_tokens
    xml_data=open(fileloc,'r')
    buf=xml_data.readlines()
    xml_data.close()
    count = 10
    for line in buf:
        #if count < 0:
        #   break
        #count =count -1
        #print line
        match = re.search('<row(.*)Body="(.*)" OwnerUserId(.*)Title="(.*)"(.*)Tags="(.*)" Answer(.*)/>', line)
        if match:
            body=match.group(2)
            tokens_in_body = re.findall(r"[\w-]+", body,re.UNICODE)
            valid_tokens=filter(lambda x: x not in stopwords.words('english') and len(x) >= 3,tokens_in_body)
            title=match.group(4)
            tokens_in_title = re.findall(r"[\w-]+",title,re.UNICODE)
            valid_tokens_in_title=filter(lambda x: x not in stopwords.words('english') and len(x) >= 3, tokens_in_title)
            valid_tokens.extend(valid_tokens_in_title)
            tags=match.group(6)
            tokens_in_tags = re.findall(r"[\w-]+", tags,re.UNICODE)
            valid_tags=filter(lambda x: x not in stopwords.words('english') and len(x) >= 3, tokens_in_tags)
            #print valid_tokens
            #print valid_tags
            training_set_cluster[training_doc_count]=set(valid_tags)
            for x in valid_tags:
                setTags.add(x)
            add_values_to_dict(valid_tokens,training_doc_count)
            training_doc_count +=1
    print len(main_dict)
    print len(setTags)
    print len(document_freq_dict)
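The same stopword/length filter is applied to body, title, and tags, and stopwords.words('english') is re-read on every call inside each filter. A sketch (hypothetical helper, not from the original project) that caches the list once and shares the logic:

import re
from nltk.corpus import stopwords

STOP = set(stopwords.words('english'))

def content_tokens(text):
    """Tokens of length >= 3 that are not English stopwords."""
    return [t for t in re.findall(r"[\w-]+", text, re.UNICODE)
            if len(t) >= 3 and t not in STOP]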
Example no. 4
    def extract_features(self, article, feats, threegram_sent_ppl, fourgram_sent_ppl, fivegram_sent_ppl, sixgram_sent_ppl, index = None):
      featureSet = {}
      articleWords = article.replace("<s>", "").replace("</s>", "").split()
      featureSet["articlelen"] = len(articleWords)
      fx_words = [word for word in articleWords if word.lower() in stopwords.words('english')]
      featureSet["fxwordcount"] = len(fx_words)/len(articleWords)
      non_words = [word for word in articleWords if word.isalpha() != True]
      featureSet["nonwordcount"] = len(non_words)/len(articleWords)
      content_words = [word for word in articleWords if word.lower() not in stopwords.words('english')]
      featureSet["contentwordcount"] = len(content_words)/len(articleWords)
      featureSet["uniquewords"] = len(set(articleWords))/len(articleWords)
      featureSet.update(feats)

      try:
        sents = [x for x in article.split("\n") if len(x) > 1]
        ppl_five = ppl_wrangling(sents, fivegram_sent_ppl)
        ppl_six = ppl_wrangling(sents, sixgram_sent_ppl)
        ppl_three = ppl_wrangling(sents, threegram_sent_ppl)
        ppl_four = ppl_wrangling(sents, fourgram_sent_ppl)
        featureSet["ppl-5"] = ppl_five
        featureSet["ppl-6"] = ppl_six
        featureSet["ppl-3"] = ppl_three
        featureSet["ppl-4"] = ppl_four
      except:
          pass

      featureSet.update(self.posTags(index, article))
      return featureSet
Example no. 5
def word_standardize(sentences): 	
    tokens = []
    sentences_st = []

    for sent in sentences:
        tokens.extend(word_tokenize(sent))
        sentences_st.append(word_tokenize(sent))
	
    words = tokens
    
    st = LancasterStemmer()

    words = [w.lower() for w in words]
    words = [w for w in words if not w in stopwords.words('english')]
    words = [w for w in words if not w in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']
    st_words = [st.stem(w) for w in words]

    sent_result = []
    for sent in sentences_st:
        sent = [w.lower() for w in sent]
        sent = [w for w in sent if not w in stopwords.words('english')]
        sent = [w for w in sent if not w in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']
        sent_result.append(sent)

    return st_words, sent_result
Example no. 6
def getBOW():
    
    predatelist, postdatelist = getDates()
    stpwrds = stopwords.words('english')
    path = './unique/posts'
    stpwrds = stopwords.words("english")
    idList = []
    doclist = [joinpath(path, fname) for fname in listdir(path) if fname.endswith('.txt')]
    
    count = 1
    predoc = []
    postdoc = []
    for file in doclist:
        with open(file,'r') as posts:
            for line in posts:
                if parser.parse(line.split('\t')[1]).date() in predatelist:
                    predoc.append(line.split('\t')[-1].decode('utf-8','ignore'))
                elif parser.parse(line.split('\t')[1]).date() in postdatelist:
                    postdoc.append(line.split('\t')[-1].decode('utf-8','ignore')) 
    
    texts1 = [[word for word in document.lower().split() if word not in stpwrds] for document in predoc]
    texts2 = [[word for word in document.lower().split() if word not in stpwrds] for document in postdoc]             
    all_tokens_pre = sum(texts1, [])
    all_tokens_post = sum(texts2, [])
    tokens_once1 = set(word for word in set(all_tokens_pre) if all_tokens_pre.count(word) == 1)
    tokens_once2 = set(word for word in set(all_tokens_post) if all_tokens_post.count(word) == 1)
    texts1 = [[word for word in text if word not in tokens_once1 and word not in stpwrds and word.isalpha()] for text in texts1]
    texts2 = [[word for word in text if word not in tokens_once2 and word not in stpwrds and word.isalpha()] for text in texts2]
    return texts1, texts2
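A side note on the flattening step in getBOW: sum(list_of_lists, []) is quadratic in the total number of tokens. itertools.chain is the usual linear-time alternative, sketched here under the same variable names:

import itertools

all_tokens_pre = list(itertools.chain.from_iterable(texts1))
all_tokens_post = list(itertools.chain.from_iterable(texts2))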
def feature_extractor(data):
    """Extract features from a relation for the classifier."""
    features = dict()
    lmtzr = WordNetLemmatizer()

    h2, h3, paragraph = data
    
    features['h2_' + h2.lower()] = True
    for word in h2.split(' '):
        if word.lower() not in stopwords.words('english') and len(word) > 1:
            features['h2word_' + word.lower()] = True
    features['h_' + h2.lower()] = True
    for word in h2.split(' '):
        if word.lower() not in stopwords.words('english') and len(word) > 1:
            features['hword_' + word.lower()] = True

    if h3 != None:    
        features['h3_' + h3.lower()] = True
        for word in h3.split(' '):
            if word.lower() not in stopwords.words('english') and len(word) > 1:
                features['h3word_' + word.lower()] = True
        features['h_' + h3.lower()] = True
        for word in h3.split(' '):
            if word.lower() not in stopwords.words('english') and len(word) > 1:
                features['hword_' + word.lower()] = True
        
    for word in nltk.wordpunct_tokenize(paragraph):
        if word.lower() not in stopwords.words('english') and len(word) > 1:
            features[word] = True
            features['lower_' + word.lower()] = True
            features['lmtzr_' + lmtzr.lemmatize(word).lower()] = True
    return features
Example no. 8
def get_stopwords(include_trectext_syntax=True):
    ignore_words = ['<doc>', '</doc>', '<text>', '</text>']

    ignore_words.extend(stopwords.words('english'))
    ignore_words.extend(stopwords.words('dutch'))

    return set(ignore_words)
def remove_stopwords(lines,method=2):

    if method==0:
        # using nltk stopwords
        stopwords_list = set(stopwords.words("english"))
    elif method==1:
        # using sklearn stopwords
        stopwords_list = list(text.ENGLISH_STOP_WORDS)
    elif method==2:
        stopwords_list =list(set(stopwords.words("english") + list(text.ENGLISH_STOP_WORDS)))
    else:
         raise ValueError('Method value should be [0-2]')

    without_sw_lines = []
    # run thru all lines
    for each_line in lines:
        a_line_without_sw = ''
        
        #tokenize each line
        tokens = each_line.split()
        
        # run thru all tokens
        for each_token in tokens:
            if each_token not in stopwords_list:
                a_line_without_sw = a_line_without_sw+' '+each_token
                
        #recreate the list all over                
        without_sw_lines.append(a_line_without_sw)
        
    return without_sw_lines
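The method == 1 and method == 2 branches assume scikit-learn's built-in English stop-word list alongside NLTK's. A sketch of the imports and the combined set these branches rely on:

from nltk.corpus import stopwords
from sklearn.feature_extraction import text  # text.ENGLISH_STOP_WORDS is a frozenset

combined_stopwords = set(stopwords.words("english")) | set(text.ENGLISH_STOP_WORDS)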
Example no. 10
def find_opinions(tokens, feature, feat, id):
    fg = 0
    for opinion in tokens:
        if opinion[0] == 'advmod' or opinion[0] == 'neg':
            if opinion[3].lower() in stopwords.words('english'):
                continue
            # endif
            if feature[1:3] == opinion[1:3]:
                fg = 1
                modifier_set.add(opinion[3])
                if id != -1:
                    mods[id].append(opinion[3])
                feat.write(
                    feature[3] + ' ' + feature[1] + ' ' +
                    opinion[3] + '\n')

            # endif
        # endif
        elif opinion[0] == 'dep':
            if opinion[3].lower() in stopwords.words('english'):
                continue
            # endif
            if feature[1:3] == opinion[1:3]:
                opinions_set.add(opinion[3])
                find_opinions(
                    tokens, ['nsubj', opinion[3], opinion[4], feature[3],
                             feature[4]], feat, -1)
        # endelif
    # endfor

    if fg == 0:
        feat.write(feature[3] + ' ' + feature[1] + '\n')
Example no. 11
def find_features(tokens, feat):
    i = 0
    for feature in tokens:
        if feature[0] == 'nsubj':
            if feature[3].lower() in stopwords.words('english'):
                continue
            if feature[1].lower() in stopwords.words('english'):
                continue
            if not valid_feature(tokens, feature):
                continue
            # endif
            mods.append([])
            features_set.add(feature[3])
            opinions_set.add(feature[1])
            find_opinions(tokens, feature, feat, len(mods) - 1)
            if i != 0:
                if tokens[i - 1][0] == 'nsubj' and tokens[i - 1][3:5] == feature[3:5]:
                    for mod in mods[len(mods) - 2]:
                        if mod not in mods[len(mods) - 1]:
                            mods[len(mods) - 1].append(mod)
                            feat.write(
                                feature[3] + ' ' + feature[1] + ' ' + mod + '\n')

        # endif
        i = i + 1
Example no. 12
	def extract_bigrams(self, text):

		text = self.remove_return_lines_and_quotes(text)
		bigrams = []

		st = PorterStemmer()
		stop = stopwords.words('english')

		more_stop_words = [
			'(', ')', "'s", ',', ':', '<', '>', '.', '-', '&', '*', '...']
		stop = stopwords.words('english')
		stop = stop + more_stop_words

		tokens = st.stem(text)
		tokens = nltk.word_tokenize(tokens.lower())
		tokens = [i for i in tokens if i not in stop]
		tokens = [word for word in tokens if len(word) > 2]

		bigram_measures = nltk.collocations.BigramAssocMeasures()
		finder = BigramCollocationFinder.from_words(tokens)
		finder.apply_freq_filter(2)
		top_bigrams = finder.nbest(bigram_measures.pmi, 1000)

		for bg in top_bigrams:
			bg = " ".join(bg)
			tag = nltk.pos_tag([bg])[0]

			if tag[1] not in ['VBG', 'RB', 'VB', 'VBD', 'VBN', 'VBP', 'VBZ', 'PRP', 'IN', 'DT', 'CC', 'PRP$']:
				bigrams.append(tag[0])

		return bigrams
Example no. 13
def CosSim(a,b):
    cossim=0
    moda=0
    aa= [word for word in a if word not in stopwords.words()]
    bb= [word for word in b if word not in stopwords.words()]
    for i in aa:
       # print "into aa"
	   #sum of square values
        moda=moda + word_frequencies[i]*word_frequencies[i]
    moda=moda**(.5)
    modb=0
    for i in bb:
        #print "into bb"
        modb=modb + word_frequencies[i]*word_frequencies[i]
    modb=modb**(.5)
	#a.b iff equal
    for i in aa:
        for j in bb:
            if(i==j):
                cossim=cossim+(word_frequencies[i]* word_frequencies[j])
    if (moda*modb == 0.0):
        return 0
    else:
        cossim=cossim/(moda*modb)
        return cossim
def adapted_lesk(context_sentence, ambiguous_word, \
                pos=None, option=False,lemma=True,hyperhypo=True, \
                stop=True):
    """
    This function is an implementation of the Adapted Lesk algorithm,
    described in Banerjee and Pedersen (2002). It uses the lexical items
    from semantically related senses within the WordNet hierarchies to
    generate more lexical items for each sense.
    See www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf
    """
    # Ensure that ambiguous word is a lemma.
    #ambiguous_word = lemmatize(ambiguous_word)
    # Get the signatures for each synset.

    ss_sign = simple_signature(ambiguous_word,lemma=True,hyperhypo=True)
    #print ss_sign
    for ss in ss_sign:
        related_senses = list(set(ss.member_holonyms() + ss.member_meronyms() + 
                                 ss.part_meronyms() + ss.part_holonyms() + 
                                 ss.similar_tos() + ss.substance_holonyms() + 
                                 ss.substance_meronyms()))
    
        try:
            signature = list([j for j in chain(*[i.lemma_names() for i in \
                      related_senses]) if j not in stopwords.words('english')])
        except:
            signature = list([j for j in chain(*[i.lemma_names for i in \
                      related_senses]) if j not in stopwords.words('english')])
        ss_sign[ss] += signature
  
    context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign)
    return best_sense
Example no. 15
    def clean(self, raw):

        letters_only = re.sub("[^a-zA-Z#@]", " ", raw)

        words = letters_only.split()

        for i in range(0, len(words)):

            if "#" in words[i]:
                s = words[i].split('#')
                words[i] = '# '.join(s)
            if "@" in words[i]:
                s = words[i].split('@')
                words[i] = '@ '.join(s)
            if "http" in words[i]:
                s = words[i].split('http')
                words[i]= "http".join(s)


        total_stop_words = set(stopwords.words("english"))
        removed_stop_words = set(stopwords.words("english")[0:20])
        stop_words = total_stop_words - removed_stop_words
        content_words = [w for w in words if not w in stop_words]

        return " ".join(content_words)
def frequencounting4Up(Listings):
    """
	Get the keywords count and the rank of the keywords
	:param Listings: the input list of tweets
	:return: a list of tuple ranked by words counts
	"""
    MyCounter = Counter()

    chars = ['.', '/', "'", '"', '?', '!', '#', '$', '%', '^', '&',
             '*', '(', ')', ' - ', '_', '+', '=', '@', ':', '\\', ',',
             ';', '~', '`', '<', '>', '|', '[', ']', '{', '}', '-', '"', '&amp;', 'rt']

    UpdatingChars = ['&amp;', 'rt', '', '#dctraffic', '#mdtraffic', '#vatraffic', 'amp', '-']

    # This section below will filter out the common english words and punctuations from the target tweets.
    for line in Listings:
        if type(line) is str:
            for word in line.strip().lower().split():
                if PunkRemovement(word.strip().lower()) not in UpdatingChars + stopwords.words(
                        'english') and not word.isdigit():
                    if len(word) > 1:
                        MyCounter[PunkRemovement(word.strip().lower())] += 1
        else:
            for word in line.text.decode('UTF-8').strip().lower().split():
                if PunkRemovement(word.strip().lower()) not in chars + stopwords.words('english'):
                    MyCounter[PunkRemovement(word.strip().lower())] += 1

    return MyCounter.most_common()
Example no. 17
def annotations_to_words(terms, dag, ipr_map, lower):
    """
    Converts a string of accesssions into a string of the corresponding english-text representations.
    """
    try:
        sws = stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
        sws = stopwords.words('english')

    if lower:
        sws = set([x.lower() for x in sws])
        case = string.lower
    else:
        sws = set([x.upper() for x in sws])
        case = string.upper

    go_terms = [t.upper() for t in terms if 'GO' in t.upper()]
    ipr_terms = [t.upper() for t in terms if t.upper() in ipr_map]

    go_descriptions = ' '.join([case(dag[t].name) for t in go_terms]).split(' ')
    ipr_descriptions = ' '.join([case(ipr_map[t]) for t in ipr_terms]).split(' ')

    go_descriptions = [x.translate(None, string.punctuation) for x in go_descriptions]
    ipr_descriptions = [x.translate(None, string.punctuation) for x in ipr_descriptions]

    go_descriptions = [x for x in go_descriptions if case(x) not in sws]
    ipr_descriptions = [x for x in ipr_descriptions if case(x) not in sws]

    line = ' '.join(go_descriptions + ipr_descriptions)
    return line
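annotations_to_words uses Python 2 idioms: string.lower / string.upper as functions and str.translate(None, string.punctuation). A Python 3 sketch (hypothetical helper, same intent) of the punctuation stripping and case folding:

import string

_strip_punct = str.maketrans('', '', string.punctuation)

def case_and_strip(word, lower=True):
    """Remove punctuation, then fold case."""
    word = word.translate(_strip_punct)
    return word.lower() if lower else word.upper()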
def freqgen_word(word):
  connect(word)
  # get english stopwords
  stopen = stopwords.words('english')
  stopfr = stopwords.words('french')
  #stopsp = stopwords.words('spanish')

  query={}
  projection={"text":1}

  cursor = db.Tweetfind.find(query,projection)

  texts = pandas.Series(list(cursor))
  tokens = []

  for text in texts.values:
    tokens.extend([word.lower().strip(':;,#."-\'!') for word in text['text'].split()])
  filtered_tokens=[]
  st = ['&amp','&nbsp','it\'s','haven\'t','can\'t','don\'t','i\'m','i\'ve','i\'ll','i\'d','#','e','@','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','rt','(',')']
  for word in tokens:
    try:
      if (not word.decode('utf-8') in stopen) and (not word.decode('utf-8') in stopfr):
        if not word in st:  
          filtered_tokens.append(word.decode('utf-8'))
    except :
      pass
  freq_dist = nltk.FreqDist(filtered_tokens)
  print type(freq_dist)
  #print freq_dist.plot(25)
  return freq_dist
Example no. 19
def fuzzer(localstring, dbpstring):
	lwl = localstring.replace('-','').replace(',.', '').split()
	lfwl = [w for w in lwl if not w in stopwords.words('english')]
	dwl = dbpstring.replace('-','').split()
	dfwl = [w for w in dwl if not w in stopwords.words('english')]
	ratio = fuzz.token_sort_ratio(str(lfwl), str(dfwl))
	return ratio
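fuzzer passes str(lfwl), i.e. the Python list literal with brackets, quotes, and commas, to the matcher. A sketch that compares space-joined token strings instead, assuming fuzz comes from fuzzywuzzy as in the original and caching the stopword set:

from fuzzywuzzy import fuzz
from nltk.corpus import stopwords

STOP = set(stopwords.words('english'))

def fuzzer_joined(localstring, dbpstring):
    lfwl = [w for w in localstring.replace('-', '').replace(',', '').replace('.', '').split() if w not in STOP]
    dfwl = [w for w in dbpstring.replace('-', '').split() if w not in STOP]
    return fuzz.token_sort_ratio(' '.join(lfwl), ' '.join(dfwl))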
Example no. 20
def removeStopWords(tokens, lang):
    filteredToken=tokens
    if lang =='en':
        filteredToken = [w for w in tokens if not w in stopwords.words('english')]
    elif lang =='es':
        filteredToken = [w for w in tokens if not w in stopwords.words('spanish')]
    return filteredToken
 def pre_process(self, text):
     for i in range(len(text)):
         text[i] = text[i].replace("-", " ")
         word_list = text[i].encode('ascii', 'ignore').lower().split(" ")
         processed_text = []
         count = 0
         for word in word_list:
             if word in stopwords.words('english'):
                 continue
             if re.match('@\w+', word):
                 continue
             if re.match('#\w+', word):
                 continue
             word = re.sub('[0-9]+', 'gotNumber', word)
             word = re.sub('http(s)?.+', 'gotURL', word)
             word = re.sub('[^a-zA-Z0-9]', ' ', word)
             words = word.split(' ')
             for w in words:
                 if w != ' ' and len(w) > 1 and w not in stopwords.words('english'):
                     w = self.sno.stem(w)
                     processed_text.append(w)
                 count += 1
                 print  '. ',
                 if count == 11:
                     print ''
                     count = 0
         text[i] = processed_text
     print ''
     return text
Example no. 22
def lazy_stopword_filter(filename):
    exclude_punctuation = set(['[', ']', '{', '}', '(', ')', ',', '!', '?', ';', ':', '<', '>'])
    stop_set = set(stopwords.words('english'))
    with open("../resources/stopwords.txt", 'r') as f:
        stop_set = stop_set | set((l.strip() for l in f.readlines()))
    outfile = sys.argv[2]
    text = open(filename, 'rb')
    reader = csv.DictReader(text, delimiter=',', quotechar='"')
    target = open(outfile, 'wb')
    fieldnames = ['Id', 'Title', 'Body', 'Tags']
    writer = csv.DictWriter(target, fieldnames, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    writer.writerow(dict((fn, fn) for fn in fieldnames))
    for line in reader:
        # remove multiple spaces from all columns
        for k, v in line.items():
            line[k] = ' '.join(v.split())
        str_to_write_title = ""
        for word in line["Title"].split():
            word = ''.join(ch for ch in word if ch not in exclude_punctuation)
            if word.lower() not in stopwords.words('english'):
                str_to_write_title = str_to_write_title + " " + word.lower()
        #print(str_to_write_title)
        str_to_write_body = ""

        body = html_tag_remover.cleanup_html(line["Body"])
        for word in body.split():  # simple tokenization
            word = ''.join(ch for ch in word if ch not in exclude_punctuation)
            if word.lower() not in stop_set:
                str_to_write_body = str_to_write_body + " " + word.lower()
        #print(str_to_write_body)

        writer.writerow({'Id': line["Id"], 'Title': str_to_write_title, 'Body': str_to_write_body, 'Tags': line["Tags"]})
def clean_total_words(data):
    all_text=list()
    for i in range(len(data)):
        all_text.append(data[i]['text'])
    words=list()
    for i in range(len(all_text)):
        words.append(nltk.word_tokenize(all_text[i]))
    wordss= list(itertools.chain.from_iterable(words))
    word_after_clean=list()
    for i in range(len(words)):
        wordss[i]=wordss[i].lower()
    stop_words = set(stopwords.words('english'))
    stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])
    for i in range(len(wordss)):
        if wordss[i] not in stop_words:
            word_after_clean.append(wordss[i])
    word_clean=list()
    for i in range(len(word_after_clean)):
        if word_after_clean[i].isalpha()==True:
            word_clean.append(word_after_clean[i])
    word_clea=list()
    for i in range(len(word_clean)):
        word_clea.append(word_clean[i].lower())
    stop_words = set(stopwords.words('english'))
    word_c=list()
    for i in range(len(word_clea)):
        if word_clea[i] not in stop_words:
            word_c.append(word_clea[i])
    return(word_c)
Example no. 24
def evaluate_html(content, html_conf):
    fdist = FreqDist()
    if html_conf['usehtml'] == False:
        logging.info('Discarding HTML tags')
        return fdist
 
    logging.info("\tEvaluating HTML")
     
    # try with TITLE tag
    titles = re.findall("<title>[A-Za-z0-9 ]+</title>", content)
    for title in titles:
        root = etree.fromstring(title)
        words_list = nltk.word_tokenize(re.sub('[^A-Za-z0-9 ]', ' ', root.text))
        terms_list = [ x for x in words_list if x.lower() not in stopwords.words('english')]
        stems = steming(terms_list)

        for i in range(html_conf['title']):
            fdist.update(stems)

    # try with H1 tag
    headers = re.findall("<h1>[A-Za-z0-9 ]+</h1>", content)
    for header in headers:
        root = etree.fromstring(header)
        words_list = nltk.word_tokenize(re.sub('[^A-Za-z0-9 ]', ' ', root.text))
        terms_list = [ x for x in words_list if x.lower() not in stopwords.words('english')]
        stems = steming(terms_list)

        for i in range(html_conf['h1']):
            fdist.update(stems)

    return fdist
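evaluate_html depends on several names that are not shown in the snippet; a plausible set of imports, with etree assumed to be an XML parser and steming a project-specific helper:

import re
import logging
import nltk
from xml.etree import ElementTree as etree   # or: from lxml import etree
from nltk.probability import FreqDist
from nltk.corpus import stopwords
# `steming` is a project helper (stemming of a term list) that is not shown here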
Example no. 25
    def build_from_text(self,input_file):
        f=open(input_file, "r" )
        all_text=f.read()
        f.close()
        tokens =nltk.word_tokenize(all_text)

        for i,w in enumerate(tokens):
            tokens[i]=self.lmtzr.lemmatize(tokens[i]).lower()

        
        for i in range(len(tokens)):
            # tokens[i]=self.lmtzr.lemmatize(tokens[i])
            if (tokens[i] in stopwords.words("english")):
                continue
            if (tokens[i] =="."):
                continue
            if self.no_meaning(tokens[i]):
                continue
            
            if tokens[i] not in self.dict:
                for k in range(self.word_variants):
                        self.vecCollection[k].append(0)
                self.dict[tokens[i]]=self.word_variants
                self.inverse_dict.append(tokens[i])
                self.word_variants+=1
                self.frequencies.append(0)
                self.vecCollection.append([0,]*self.word_variants)
                
            self.frequencies[self.dict[tokens[i]]]+=1
            
            for j in range (i+1,i+window_length+1):
                if j>=len(tokens):
                    break
                if (tokens[j] =="."):
                    break
                if (tokens[j] in stopwords.words("english")):
                    continue
                
                if self.no_meaning(tokens[j]):
                    continue
            
              
                if tokens[j] not in self.dict:
                    for k in range(self.word_variants):
                        self.vecCollection[k].append(0)

                    self.dict[tokens[j]]=self.word_variants
                    self.inverse_dict.append(tokens[j])
                    self.word_variants+=1
                    self.vecCollection .append([0,]*self.word_variants)
                    self.frequencies.append(0)
                
              
                self.vecCollection[self.dict[tokens[i]]][self.dict[tokens[j]]]+=1
                self.vecCollection[self.dict[tokens[j]]][self.dict[tokens[i]]]+=1
        #f.close()
        
        del(all_text)
        del(tokens)
def preprocessQuery(query):
    query = query.lower()
    query = re.sub(r'[^a-z0-9 ]',' ',query)
    wordListAll = wordpunct_tokenize(query);
    # Now combine wordList with operators also
    # So, wordList now contains (word, Operator) before we do stop word removal
    wordList = []
    i = 0
    while i < len(wordListAll):
        if wordListAll[i] == "AND" or wordListAll[i] == "and":
            wordList.append( (wordListAll[i+1], "AND") )
            i += 2
        elif wordListAll[i] == "OR" or wordListAll[i] == "or":
            wordList.append( (wordListAll[i+1], "OR") )
            i += 2
        else:
            wordList.append( (wordListAll[i], "OR") )
            i += 1

    # Filter the words and remove stop words.
    filteredWords = [w for w in wordList if not w[0] in stopwords.words('english')]

    queryTuples = []
    queryLen = len(filteredWords)
    if queryLen > 15:
        queryTuples = filteredWords
    else:
        for word, operator in filteredWords:
            synonymList = getSynonymList(word)
            queryTuples.append((word, operator))
            for synCount, syn in enumerate(synonymList):
                if synCount > 3:
                    break
                # Adding operator OR in synonyms list if its not a stop word
                syn = re.sub(r'[^a-z0-9 ]', ' ', syn)
                synList = syn.split()
                for synOneTerm in synList:
                    if not synOneTerm in stopwords.words('english'):
                        queryTuples.append((synOneTerm, "OR"))


    # queryTuples list is ready (filtered). Now need to stem this list,
    # ensuring no duplicacy, same order and operator values
    finalQueryList = []
    porterStemmer = PorterStemmer()
    for word, operator in queryTuples:
        finalQueryList.append( (porterStemmer.stem(word), operator) )

    # Now removing duplicate items from list
    seenSet = set()
    uniqueList = []
    for q in finalQueryList:
        stemWord = q[0]
        if stemWord in seenSet:
            continue
        uniqueList.append(q)
        seenSet.add(q[0])

    return uniqueList
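One caveat in the operator-pairing loop: wordListAll[i + 1] raises IndexError when the query ends with AND/OR. A minimal, self-contained sketch of the same pairing with a bounds check (hypothetical helper, not from the original):

def pair_terms_with_operators(wordListAll):
    """Pair each term with AND/OR, guarding against a trailing operator."""
    wordList = []
    i = 0
    while i < len(wordListAll):
        tok = wordListAll[i].lower()
        if tok in ("and", "or") and i + 1 < len(wordListAll):
            wordList.append((wordListAll[i + 1], tok.upper()))
            i += 2
        else:
            wordList.append((wordListAll[i], "OR"))
            i += 1
    return wordList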
Example no. 27
def tf_idf(review1, review2):
	def get_tokens(document):
		#remove the punctuation using the character deletion step of translate
		#no_punctuation = document.translate(None, string.punctuation)
		tokens = nltk.word_tokenize(document)
		return tokens

	def tokenize(text):
		tokens = nltk.word_tokenize(text)
		stems = stem_tokens(tokens, stemmer)
		return stems
	
	def stem_tokens(tokens, stemmer):
		stemmed = []
		for item in tokens:
			stemmed.append(stemmer.stem(item))
		return stemmed

	#document1 = ("I think this is one of the higher crazy selection end Chinese restaurants in Michigan, one of the best that I've ever been to in the US. The seating is great, with professional waiters and waitresses. My boyfriend and I went to this place to have a taste of their famous Peking Duck, and the dish turned out to be really amazing! I like how they came to your desk and serve directly to your plate for the first round, and the second round comes out very quickly too. The shrimp dumplings are great too, much better than the usual ones you may get at a Dim Sun place. Of course the price is a bit higher too. ")
	#document2 = ("Yummms.coms!! This is some good Chinese food!! They have a full bar, great selection super good location right by my house. My only issue is cost...beer is like 8 bucks and they did not have my favorite the night I was there and it's was a Saturday night. Look I know this is not Chicago...I should not hold them to that standard but dang....Chinese bar should have Tsingtao coming out of their ears on Saturdays. ")	
	#print document1
	#token1 = get_tokens(document1.lower()) 
	#token2 = get_tokens(document2.lower()) 
	#print token1
	token1 = get_tokens(review1) 
	token2 = get_tokens(review2) 
	count1 = Counter(token1)
	#count1=Counter(review1)
	#count2=Counter(review2)
	#print count1.most_common(10)
	count2 = Counter(token2)
	#print count2.most_common(10)
	#print "\n"

	filtered1 = [w for w in token1 if not w in stopwords.words('english')]
	count_filter1 = Counter(filtered1)
	#print count_filter1.most_common(10)
	filtered2 = [w for w in token2 if not w in stopwords.words('english')]
	count_filter2 = Counter(filtered2)
	#print count_filter2.most_common(10)
	#print "\n"

	stemmer = PorterStemmer()
	stemmed1 = stem_tokens(filtered1, stemmer)
	stemmed2 = stem_tokens(filtered2, stemmer)
	count_stem1 = Counter(stemmed1)
	count_stem2 = Counter(stemmed2)
	
	stemmed1=' '.join(stemmed1)
	stemmed2=' '.join(stemmed2)
	#print stemmed1
	#print stemmed2
	documents=[stemmed1,stemmed2]
	tfidf_vectorizer = TfidfVectorizer()
	tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
	#print tfidf_matrix.shape

	#print cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)
	return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)
Example no. 28
def tokenize(text):
	lookup_t = nltk.word_tokenize(text)
	nltk.data.load('tokenizers/punkt/spanish.pickle')
	stops_es=stopwords.words('spanish')
	stops_en=stopwords.words('english')
	stops_custom = ['http']
	tokenized = [word.lower() for word in lookup_t if word not in stops_es and word not in stops_custom and word not in stops_en and word.isalpha() and len(word)>2]
	return tokenized
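The Spanish Punkt model loaded on the second line is never used: nltk.data.load returns a sentence tokenizer rather than reconfiguring word_tokenize. If Spanish-aware tokenization is the intent, NLTK 3's word_tokenize accepts a language argument, e.g.:

lookup_t = nltk.word_tokenize(text, language='spanish')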
Example no. 29
 def _remove_bigram_stopwords(self,bigrams):
     filtered_words = []
     for w in bigrams:
         if (w[0] in stopwords.words('english')) and (w[1] in stopwords.words('english')):
             pass
         else:
             filtered_words.append(w)
     return filtered_words
Example no. 30
	def PreparaFiltroTexto(self, recurso):
		tittletype = ""	
		for s in recurso.split():
			if s.lower() in stopwords.words('english') or s.lower() in stopwords.words('spanish'):
				tittletype = tittletype + s.lower() + ' ' 
			else:
				tittletype = tittletype + s.title() + ' ' 
		tittletype = tittletype[:-1]
		return tittletype
import numpy as np
from numpy import dot
from numpy.linalg import norm
import math
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

#### Set parameters
N_dimension = 2000
Tau = 3
Forgetting_rate = 0

#### Get unique words in the corpus
lemmatizer = WordNetLemmatizer()
stopwords = stopwords.words("english")

######################################
####          DEFINITIONS          ####
######################################

#############################
#### COSINE SIMILARITY
#############################


def Cosine(x, y):
    z = 0
    if sum(x) != 0 and sum(y) != 0:
        z = dot(x, y) / (norm(x) * norm(y))
    return z
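A quick check of the Cosine helper above, reusing the numpy import at the top of the snippet (note the sum()-based guard also returns 0 for any vector whose components sum to zero, e.g. [1, -1]):

print(Cosine(np.array([1.0, 0.0]), np.array([1.0, 1.0])))  # ~0.7071
print(Cosine(np.zeros(2), np.array([1.0, 1.0])))           # 0: zero vector is guarded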
df2.columns = ['rev', 'p']
df3 = pd.read_csv('twitter train.csv', delimiter=',', encoding='ISO-8859-1')
df3.drop('ItemID', axis=1, inplace=True)
df3 = df3.rename(columns={'SentimentText': 'rev', 'Sentiment': 'p'})

frames = [df2, df1, df3]

result = pd.concat(frames)

result

df = shuffle(result)
df.reset_index(inplace=True)
df.drop('index', axis=1, inplace=True)

sw = set(stopwords.words('english'))
df = df
for i in range(len(df)):
    review = df['rev'][i]
    review = ' '.join(review.split(' ')[1:])
    review = review.lower()
    review = re.sub('[^a-zA-Z]', ' ', review)
    review = review.split(' ')

    review = ' '.join(list(filter(None, review)))
    words = word_tokenize(review)
    filt = [w for w in words if not w in sw]
    df['rev'][i] = ' '.join(filt)

df.to_csv('finalmixrev.csv', sep='\t', encoding='utf-8')
Example no. 33
def create_new_blog():
    if request.method == 'GET':
        return render_template('new_blog.html')
    else:
        title = request.form['title']
        user = User.get_by_email(session['email'])

        new_blog = Blog(user.email, title, user._id)
        new_blog.save_to_mongo()

        new_title = title.replace(" ", "_")  # to be able to use it in the url
        my_url = "https://www.rottentomatoes.com/m/" + new_title + "/reviews/"
        # case sensitivity in url gets corrected automatically by browser

        # obtain the reviews of the required movie
        req = requests.get(my_url)
        content = req.content
        soup = BeautifulSoup(content, "html.parser")
        element = soup.find_all("div", {"class": "the_review"})

        if len(element) == 0:
            new_url = req.url + "/reviews/"
            req = requests.get(new_url)
            content = req.content
            soup = BeautifulSoup(content, "html.parser")
            element = soup.find_all("div", {"class": "the_review"})

        # preparing test set
        test_set_reviews = []
        for i in range(len(element)):
            test_set_reviews.append(element[i].text)

        corpus2 = []

        for i in range(len(test_set_reviews)):
            review = re.sub(
                '[^a-zA-Z]', ' ', test_set_reviews[i]
            )  # remove numbers and punctuations (don't remove letters a-z and A-Z) and second parameter ' ' is used to replace the removed character by a space.
            review = review.lower()  # convert all letters to lowercase
            review = review.split(
            )  # convert the review into a list of different words of the review.

            ps = PorterStemmer(
            )  # Stemming process to keep only the root of the word i.e. keep 'love' and not 'loved'
            stop_words = set(stopwords.words('english'))
            stop_words.update(
                ['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
            review = [
                ps.stem(word) for word in review if not word in stop_words
            ]  # retain only those english words in the list that are not present in stopwords. 'set' is used to make the algo faster because python goes through a set faster than a list

            review = ' '.join(
                review
            )  # convert the list of words back to a single string of words.
            if review == '' or review == ' ':  # sometimes, after applying the cleaning above, nothing is left
                review = 'neutral'

            corpus2.append(review)

        if len(corpus2) == 0:  # if no reviews found
            return "Sorry! No reviews yet for this movie. Please check spelling or try some other movie."

        # create the bag of words
        from sklearn.feature_extraction.text import TfidfVectorizer

        vectorizer = TfidfVectorizer(ngram_range=(1, 3))
        x_train = vectorizer.fit_transform(corpus)
        x_train = x_train.astype('float16')

        x_test = vectorizer.transform(corpus2)
        x_test = x_test.astype('float16')

        # fitting SGD Classifier
        from sklearn.linear_model import SGDClassifier

        classifier_sgd = SGDClassifier(loss='hinge',
                                       shuffle=True,
                                       penalty='elasticnet',
                                       alpha=0.00001)
        classifier_sgd.fit(x_train, sentiment_train)

        # predict
        y_pred_sgd = classifier_sgd.predict(x_test)

        res = 0
        for i in range(len(y_pred_sgd)):
            if y_pred_sgd[i] == 4:
                y_pred_sgd[i] = 3
            elif y_pred_sgd[i] == 0:
                y_pred_sgd[i] = 1

        for i in range(len(y_pred_sgd)):
            if y_pred_sgd[i] == 1:
                res += 0
            elif y_pred_sgd[i] == 2:
                res += 50
            else:
                res += 100
        rate = res / (len(y_pred_sgd))
        rate = str(rate)

        from collections import Counter
        data = Counter(y_pred_sgd)
        ans = data.most_common(1)[0][0]  # Returns the highest occurring item

        if ans == 1:
            return "Negative Reviews!! Drop this Movie. " + "rating is : " + rate
        elif ans == 2:
            return "Neutral Reviews!! Go at your own risk. :) " + "rating is : " + rate
        elif ans == 3:
            return "Positive Reviews!! Go for it. " + "rating is : " + rate
        else:
            return "Sorry! Some Error in Processing"
Example no. 34
    )"""
regex_str = [
    emoticons_str,
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words with - and '
    r'(?:[\w_]+)',  # other words
    r'(?:\S)',  # anything else
]

# Create stop word dictionary
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via', 'amp', 'get', 'gt', '1', '10', 'click']

tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^' + emoticons_str + '$', re.VERBOSE | re.IGNORECASE)


def tokenize(s):
    s = re.sub(r'[^\x00-\x7f]*', r'', s)
    return tokens_re.findall(s)

def preprocess(s):
    tokens = tokenize(s)
    # To lower
    tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens
Example no. 35
                    int(count) + 1) * math.log(total_docs / int(curr))
                term_freq_dict[doc_id][field_idx + 1] += 1
                term_freq_dict[doc_id][0] = max(term_freq_dict[doc_id][1:7])
    total_results = len(term_freq_dict)
    results = sorted(term_freq_dict.items(),
                     key=lambda x: (x[1], x[0]),
                     reverse=True)
    final_result = [x[0] for x in results]
    return (final_result[:min(num_results, len(results))], total_results)


if len(sys.argv) < 3:
    print('Insufficient Arguments provided')
    exit(0)

STOP_WORDS_SET = set(stopwords.words('english'))
STEMMER = Stemmer('porter')
data_dir = os.path.join('.', "data")
field_type_to_index = {
    'title': 0,
    'body': 1,
    'ref': 2,
    'infobox': 3,
    'link': 4,
    'category': 5
}
secondary_index = {
    'title': [],
    'body': [],
    'ref': [],
    'infobox': [],
Example no. 36
#Natural language processing
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
nltk.download('stopwords')
list1 = []
for i in range(0, 5536):
    mail = df.Message[i]
    #print(mail)
    mail = re.sub('[^a-zA-Z]', ' ', mail)
    mail = mail.lower()
    mailwords = mail.split()
    mailwords = [
        ps.stem(word) for word in mailwords
        if word not in stopwords.words('english')
    ]
    mail = ' '.join(mailwords)
    list1.append(mail)

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(list1).toarray()
y = df.Status.values

from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)
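sklearn.cross_validation was removed in scikit-learn 0.20; on current releases the same split comes from model_selection, and the train_test_split call itself is unchanged:

from sklearn.model_selection import train_test_split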
Example no. 37
BOOK_LIST = ['hobbes-leviathan', 'hobbes-liberty', 'hobbes-elements', 'hobbes-law', 'mill-liberty', 'mill-util','locke-understanding',
             'locke-treatise', 'hume-treatise', 'hume-morals', 'hume-enquiry', 'berkeley-TOK','berkeley-TD', 
             'bentham-POM', 'bentham-FOG', 'mill-representative', #'burke-reflections','conway-nature','mill-comte','more-utopia',
             'reid-mind', 'hume-religion']  # this is the booklist we will analyse. Must be in the same folder
TEST_FILES = ['sidgwick.txt','machiavelli.txt','more-utopia','burke-reflections','smith-sentiments','smith-wealth',
             'fedPapers', 'mill-logic', 'kant-CPR', 'russell-AOM', 'russell-external', 'russell-ideals',
              'russell-mysticism', 'russell-POP', 'spinoza-ethica', 'spinoza-understanding','Shi-PC', 'Shi-equality',
              'Shi-AM', 'Shi-MP']
NUM_CLUSTERS = 6 # how many clusters we want to categorize when we process different individual books.
SENTIMENT_LIST = []

# In[9]:

#Adding more stopwords. Providing the option of an aggressive word list.
# nltk.download('stopwords')  #Not necessary if you have done it once
stop_words = list(set(stopwords.words('english')))
stop_words.append('\'s')#manually add 's into the stop word list (because it's annoying!) We may add more similar ones.

if MORE_SW:   #if we want to add more stop words and render a more aggressive stopword list
    with open('stopwords', 'r') as myfile:
        sw = [i.strip().split(' ') for i in myfile]
    sw1 = [val.lower() for sublist in sw for val in sublist]
    stop_words.extend(sw1)
    stop_words = set(stop_words)


# In[11]:

def tokenize(text):
    '''
    Tokenize the words in a texts. If we need tokenize and stemming, we can 
@author: bhavyababuta
"""

import pandas as pd
import numpy as np
import re
import matplotlib 
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud,STOPWORDS

import nltk
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
stopWordList=stopwords.words('english')


data=pd.read_csv('Tweets.csv')
data.head(20)

data.isnull().sum()
data.describe(include='all')

for columns in data.columns:
    print('%s'%(columns))
    data[columns].value_counts()
    
data['airline'].value_counts()
data['retweet_count'].value_counts()
    def get_summary(self, input, max_sentences):
        sentences_original = sent_tokenize(input)

        #Remove all tabs, and new lines
        if (max_sentences > len(sentences_original)):
            print(
                "Error, number of requested sentences exceeds number of sentences inputted"
            )
            #Should implement error schema to alert user.
        s = input.strip('\t\n')

        #Remove punctuation, tabs, new lines, and lowercase all words, then tokenize using words and sentences
        words_chopped = word_tokenize(s.lower())

        sentences_chopped = sent_tokenize(s.lower())

        stop_words = set(stopwords.words("english"))
        punc = set(string.punctuation)

        #Remove all stop words and punctuation from word list.
        filtered_words = []
        for w in words_chopped:
            if w not in stop_words and w not in punc:
                filtered_words.append(w)
        total_words = len(filtered_words)

        #Determine the frequency of each filtered word and add the word and its frequency to a dictionary (key - word,value - frequency of that word)
        word_frequency = {}
        output_sentence = []

        for w in filtered_words:
            if w in word_frequency.keys():
                word_frequency[w] += 1.0  #increment the value: frequency
            else:
                word_frequency[w] = 1.0  #add the word to dictionary

        #Weighted frequency values - Assign weight to each word according to frequency and total words filtered from input:
        for word in word_frequency:
            word_frequency[word] = (word_frequency[word] / total_words)

        #Keep a tracker for the most frequent words that appear in each sentence and add the sum of their weighted frequency values.
        #Note: Each tracker index corresponds to each original sentence.
        tracker = [0.0] * len(sentences_original)
        for i in range(0, len(sentences_original)):
            for j in word_frequency:
                if j in sentences_original[i]:
                    tracker[i] += word_frequency[j]

        #Get the highest weighted sentence and its index from the tracker. We take those and output the associated sentences.

        for i in range(0, len(tracker)):

            #Extract the index with the highest weighted frequency from tracker
            index, value = max(enumerate(tracker), key=operator.itemgetter(1))
            if (len(output_sentence) + 1 <= max_sentences) and (
                    sentences_original[index] not in output_sentence):
                output_sentence.append(sentences_original[index])
            if len(output_sentence) > max_sentences:
                break

            #Remove that sentence from the tracker, as we will take the next highest weighted freq in next iteration
            tracker.remove(tracker[index])

        sorted_output_sent = self.sort_sentences(sentences_original,
                                                 output_sentence)
        return (sorted_output_sent)
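One caveat in the selection loop above: tracker.remove(tracker[index]) shortens the list, so on later iterations the indices from enumerate(tracker) no longer line up with sentences_original. A minimal sketch that ranks indices without mutating the list (hypothetical helper, not from the original class):

import operator

def top_sentence_indices(tracker, k):
    """Indices of the k highest-weighted sentences, highest first."""
    ranked = sorted(enumerate(tracker), key=operator.itemgetter(1), reverse=True)
    return [idx for idx, _ in ranked[:k]]

The summary would then be [sentences_original[i] for i in sorted(top_sentence_indices(tracker, max_sentences))].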
    target_word_2 = list[3].lower()
    target_word_3 = list[4].lower()
    f.close()

    # large capital -> small capital
    discourse_words_1 = [s.replace(s, s.lower()) for s in discourse_words_1]
    discourse_words_1and2 = [s.replace(s, s.lower()) for s in discourse_words_1and2]

    # remove '.' and ',' from word list
    discourse_words_1 = [s.replace('.', '') for s in discourse_words_1]
    discourse_words_1and2 = [s.replace('.', '') for s in discourse_words_1and2]
    discourse_words_1 = [s.replace(',', '') for s in discourse_words_1]
    discourse_words_1and2 = [s.replace(',', '') for s in discourse_words_1and2]

    # remove stop words from word list
    stop_words = stopwords.words('english')
    #print(stop_words)
    for stop_word in stop_words:
        while stop_word in discourse_words_1 :
            discourse_words_1.remove(stop_word)
            
        while stop_word in discourse_words_1and2 :
            discourse_words_1and2.remove(stop_word)
            
    # remove "'s" and "'" and "-" and "'d" and "'ll" and "'ve" and "re" from word list
    discourse_words_1 = [s.replace("'s", '') for s in discourse_words_1]
    discourse_words_1and2 = [s.replace("'s", '') for s in discourse_words_1and2]
    discourse_words_1 = [s.replace("'", '') for s in discourse_words_1]
    discourse_words_1and2 = [s.replace("'", '') for s in discourse_words_1and2]
    discourse_words_1 = [s.replace("-", '') for s in discourse_words_1]
    discourse_words_1and2 = [s.replace("-", '') for s in discourse_words_1and2]
import nltk
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords
import pandas as pd
import scipy.stats as sp

tokenizer = WhitespaceTokenizer()

nltk.download()

stopword_list = stopwords.words('english')

reviews_df = pd.read_csv("C:/Users/Documents/Yelp/yelp_academic_dataset_review.csv", encoding="utf-8")

positive_terms = []
f = open('C:/Users/Documents/Yelp/positive_terms.txt', "r")
positive_terms = f.read().splitlines()
f.close()

negative_terms = []
f = open('C:/Users/Documents/Yelp/negative_terms.txt', "r")
negative_terms = f.read().splitlines()
f.close()

porter = nltk.PorterStemmer()
def normalize_review_text(text):
    text = text.lower()
    text = remove_punctuation(text)
    text = " ".join(text.split())
    text_tokens = tokenizer.tokenize(text)
    text_tokens = [porter.stem(w) for w in text_tokens if w not in stopword_list]
Example no. 42
                                         for node in node_info)

#making sure keys are integers
node_info_tokenized = {int(k): v for k, v in node_info_tokenized.items()}

#with open('./ISAE_Comp/out/node_info_token.json', 'w') as file:
#    json.dump(node_info_tokenized, file)
print("Finished tokenizing {0} entries to dictionnary and saved it to file".
      format(len(node_info_tokenized.keys())),
      flush=True)
'''
    Removing stopwords
'''
print("Downloading french stopwords", flush=True)
nltk.download('stopwords')
stop_words = stopwords.words('french')

node_info_filtered = {}


def remove_stopwords(node):
    '''
        add an entry on node_info_filtered dict for the node as word list removing stopwords from node_info_tokenized
    '''
    node_info_filtered[node] = []
    for w in node_info_tokenized[node]:
        if w not in stop_words:
            node_info_filtered[node].append(w)


print("Starting stopword removal", flush=True)
Example no. 43
        b.append(element)
        return b


def category(a):
    return {
        '1': 'Negative',
        '2': 'S Negative',
        '3': 'Neutral',
        '4': 'S Positive',
        '5': 'Positive'
    }.get(a)


#Build a training data set
stop = stopwords.words('english')
with open("train.tsv") as csvfile:
    records = csv.reader(csvfile, delimiter='\t')
    next(records)
    t = [({
        word: True
        for word in nltk.word_tokenize(row[2]) if word not in stop
    }, (row[3])) for row in records]
print('Train record count: ' + str(len(t)))
##trainlen = int((len(t) * 3 / 4))
##train = t[:trainlen]
##test = t[trainlen:]

##test file data for later.  Might want to incorporate a database read
with open("test.tsv") as csvfile:
    records2 = csv.reader(csvfile, delimiter='\t')
Example no. 44
from tkinter.filedialog import askopenfilename
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

idenditify = [
    '\033[92m' + 'Ad   : Aykut' + '\033[0m',
    '\033[92m' + 'Soyad: Cengiz' + '\033[0m',
    '\033[92m' + 'No   : 503020190030' + '\033[0m',
    '\033[92m' + '<Information Retrieval Final Project>' + '\033[0m'
]

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
porter = PorterStemmer()
snowy = SnowballStemmer("english")
translator = str.maketrans('', '', string.punctuation)

Tk().withdraw()

baslıklar = [
    'doga', 'bilim', 'hukuk', 'din', 'ekonomi', 'is', 'moda', 'siyaset', 'spor'
]

diseaseAllergie = dict()
diseaseAnxiety = dict()
diseaseBipolar = dict()
diseaseBrainTumour = dict()
diseaseBreastCancer = dict()
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.chunk import ne_chunk

sentences = [
'Natural language processing (NLP) is a field of computer science, artificial intelligence, and computational linguistics concerned with the interactions between computers and human (natural) languages',
'Modern NLP algorithms are based on machine learning, especially statistical machine learning', 
'NLP research is gradually shifting from lexical semantics to compositional semantics and, further on, narrative understanding',
'The learning procedures used during machine learning automatically focus on the most common cases, whereas when writing rules by hand it is often not obvious at all where the effort should be directed',
'Produce a readable summary of a chunk of text. Often used to provide summaries of text of a known type, such as articles in the financial section of a newspaper'
]

stopword_set = set(stopwords.words('english'))

def preprocessing(sentence):
    tokenized = set(word_tokenize(sentence))
    tokenized = tokenized - stopword_set
    stemmed = [(PorterStemmer().stem(i)) for i in tokenized]
    lemmatized = [(WordNetLemmatizer().lemmatize(i)) for i in stemmed]
    return set(lemmatized)

def menu_1():
    global user_sentences

    while True:
        inp = input('Input Sentence : ')
        if len(inp) > 10:
            break
Example no. 46
for i in battle_text_rep:
    for j in i:
        battle_text.append(j)


# In[10]:


battle_text = [x.lower() for x in battle_text if x not in ['—', '==', '===', '', '====']]


# In[11]:


battle_stopwords = []
stopword = stopwords.words("russian")
for i in tqdm_notebook(battle_text):
    if i not in stopword:
        battle_stopwords.append(i)


# In[12]:


dict_freq = sorted(Counter(battle_stopwords).items(), key=lambda x: x[1], reverse=True)


# In[13]:


final_battle = []
def bag_of_words(text, url_path):
    tfidfconverter = TfidfVectorizer(max_features=100, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
    X = tfidfconverter.fit_transform(text)  ###.toarray()
    rows = tfidfconverter.get_feature_names()
    BOW_dataset = pd.DataFrame(X.T.todense(), index=rows)
    url_path = os.path.dirname(url_path)
    scipy.sparse.save_npz(os.path.join(url_path, "keywords.npz"), X)
    BOW_dataset.to_csv(os.path.join(url_path, "keywords.csv"))  # dictionary of keywords
        print("Welcome to the Search Engine\n")
        while continueLoop:
            fromUser = ""
            user_query = ""
            print("\n\nSelect from the Following Options:\n\t1.) Search\n\t2.) Exit")
            from_user = input("Your Choice: ")

            if from_user == "1":
                # NOTE: this function is raw_input for Python 2.x
                print("\nSearching through the ''{0}'' File Cache:".format(doc_basename))
                user_query = input("What Is Your Query?:  ")
                formatted_query = (re_sub(r"[^a-zA-Z0-9_ ]+", "", user_query.lower().strip())).split()
                query = []
                for i in range(0, len(formatted_query)):
                    if formatted_query[i] not in stopwords.words("english"):
                        query.append(stemmer.stem(formatted_query[i]))

                vsm = VSMClass(iic, doc_basename)
                qr = QueryClass(query, vsm)
                qr.computeSimilarities(10)

                # first index = location of unprocessed documents; second index = list of documents in order of similarity > 0
                location_and_documents = getDocuments(qr.all_similarities, iic, doc_location, query)

                if len(location_and_documents[1]) > 0:
                    print("\nResults:")
                    for i in range(0, len(location_and_documents[1])):
                        # NOTE: this might be yielding an encoding error
                        try:
                            print("\t\tURL:\t{0}".format(location_and_documents[1][i]))
Esempio n. 49
0
import os.path as osp
import torch
from torch.utils.data import Dataset
import transformers
import string
import nltk
from tqdm import tqdm
from nltk.corpus import stopwords
import logging

UNK = '[UNK]'
nltk.download('stopwords')
nltk.download('punkt')
STOP_WORDS = stopwords.words('english')
DROPPED = STOP_WORDS + list(string.punctuation)
CATEGORY_IDS = {'1-to-1': 0, '1-to-many': 1, 'many-to-1': 2, 'many-to-many': 3}


def file_to_ids(file_path):
    """Read one line per file and assign it an ID.

    Args:
        file_path: str, path of file to read

    Returns: dict, mapping str to ID (int)
    """
    str2id = dict()
    with open(file_path) as file:
        for i, line in enumerate(file):
            str2id[line.strip()] = i
    return str2id
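
# Usage sketch (added, hypothetical path): map each line of an entity list to an integer ID.
# entity2id = file_to_ids('data/entities.txt')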
Esempio n. 50
0
def group_data(roots_df: pd.DataFrame, notes_df: pd.DataFrame, w2v,
               tokenizer) -> tuple:
    """Group the roots and notes data for modeling; returns (model_df, label classes)."""
    # map each note_id to its tokens
    note_map = dict(notes_df.loc[:, ["note_id", "text"]].values)
    hadm_map = dict(notes_df.loc[:, ["note_id", "hadm_id"]].values)

    # join icd roots with notes
    print("Merging note and roots .....")
    df = roots_df.merge(notes_df, on="hadm_id", how="inner").dropna()

    # group by admission
    print("Grouping by hadm id .....")
    df = df.groupby("hadm_id").aggregate(list).reset_index()

    # get unique roots and notes per grouping
    print("Replicating notes .....")
    df["roots"] = df["roots"].apply(lambda x: list(set(x)))
    df["note_id"] = df["note_id"].apply(lambda x: list(set(x)))

    # replicate root lists for each note they are related to
    roots = list(
        it.chain.from_iterable(
            map(lambda r, nids: [r] * len(nids), df["roots"].tolist(),
                df["note_id"].tolist())))

    # flatten note ids
    note_ids = list(it.chain.from_iterable(df["note_id"].tolist()))

    # flatten notes grouped by hadm_id
    notes = [note_map[nid] for nid in note_ids]

    # reassign hadm_id for each note id
    hadm_ids = [hadm_map[nid] for nid in note_ids]

    # store the resulting replications in a modeling df
    model_df = pd.DataFrame({
        "roots": roots,
        "text": notes,
        "hadm_id": hadm_ids
    })

    # tokenize and remove stop words
    print("Creating tokens .....")
    all_stops = set(stopwords.words("english"))
    model_df["tokens"] = model_df["text"]\
        .apply(lambda t: [w for w in word_tokenize(t) if w not in all_stops])

    # remove rows with no tokens from word2vec
    model_df["tokens"] = model_df["tokens"]\
        .apply(lambda x: [t for t in x if t in w2v])
    model_df["tokens"] = model_df["tokens"]\
        .apply(lambda x: None if len(x) == 0 else x)
    model_df = model_df.dropna()

    # average word embeddings to generate d2v embeddings
    print("Creating d2v .....")
    model_df["d2v"] = model_df["tokens"]\
        .apply(lambda doc: list(np.mean([w2v[t] for t in doc if t in w2v],
                                        axis=0)))

    # get column for embedding indices
    print("Creating w2v indices .....")
    model_df["w2v_idx"] = model_df["tokens"]\
        .apply(lambda doc: [w2v.vocab[w].index for w in doc if w in w2v])

    # get bert embeddings indices
    print("Creating bert indices .....")
    model_df["bert_idx"] = model_df["text"]\
        .apply(lambda doc: torch.tensor(tokenizer\
                                        .encode(doc, add_special_tokens=True))\
               .unsqueeze(0))

    # one hot encode labels; wrap in list() so each row holds its own binary label vector
    mlb = MultiLabelBinarizer()
    model_df["roots"] = list(mlb.fit_transform(model_df["roots"]))

    return model_df, mlb.classes_
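
# Usage sketch (added; argument names are hypothetical): the caller supplies the ICD-root and
# note dataframes, a gensim KeyedVectors-style w2v object and a transformers tokenizer.
# model_df, label_classes = group_data(roots_df, notes_df, w2v, tokenizer)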
def indexer():
    with codecs.open("valid_URL.txt", "r", encoding='utf8') as fh_book:
        global word_freq_title_final
        global document_count
        global word_freq_final
        global link_analysis
        outLinks = []
        for line in fh_book:
            info = line.split()
            path = info[0]
            url = info[1]
            print "Path :" + str(path)
            url = "http:" + url
            if (path == "39/373") or (path == "56/176") or (path == "10/451") or (path == "55/433"):
                print "Pass_bad_URL_hardCode"
                continue
            return_val = is_valid(url)
            if return_val == True:
                if magic.from_file(path).startswith('HTML') or magic.from_file(path).startswith('XML'):
                    document_count += 1
                    fh = codecs.open(path, 'r', encoding='utf8')
                    soup = BeautifulSoup(fh, 'lxml')
                    fh.close()
                    # TODO comment after first run
                    [x.extract() for x in soup.find_all('script')]
                    sample_list = soup.get_text().lower()
                    # comment next two lines
                    outLinks = extract_next_links(soup, url)
                    link_analysis[url] = outLinks
                elif magic.from_file(path).startswith('ASCII') or magic.from_file(path).startswith('UTF'):
                    document_count += 1
                    fh = codecs.open(path, 'r', encoding='utf8')
                    sample_list = fh.read()
                else:
                    continue
                tokenizer = RegexpTokenizer(r'\w+')
                punct_remove = tokenizer.tokenize(sample_list)
                token_list_stopwords = [word for word in punct_remove if word not in stopwords.words('english')]
                stemmer = PorterStemmer()
                stemmed_list = stem_porter(token_list_stopwords, stemmer)
                word_freq = Counter(stemmed_list)
                word_freq_title_final = processTitle(soup, path, stemmed_list)
                tags = processTags(soup, path)
                tag_final = createTagIndex(tags, path, stemmed_list)
                for word in word_freq:
                    # TODO : add check conditions from below
                    if checkCondition7(word):
                        indices = [i for i, x in enumerate(stemmed_list) if x == word]
                        length = word_freq[word]
                        totallength = len(word_freq)
                        posting = {}
                        posting["docID"] = path
                        # posting["occurences"] = indices
                        posting["TF"] = length
                        if word_freq_final.get(word) == None:
                            sample_list = list()
                            sample_list.append(posting)
                            word_freq_final[word] = sample_list
                        else:
                            sample_list1 = word_freq_final.get(word)
                            sample_list1.append(posting)
                            word_freq_final[word] = sample_list1
        writeTitleIndex(word_freq_title_final)
        writeWordIndex(word_freq_final)
        writeLinks(link_analysis)
        writeTagIndex(tag_final)
Esempio n. 52
0
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)
corpus = []
ps = PorterStemmer()
all_stopwords = set(stopwords.words('english'))
all_stopwords.remove('not')  # keep 'not' so negations are preserved in the reviews
for i in range(0, 1000):
  review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
  review = review.lower()
  review = review.split()
  review = [ps.stem(word) for word in review if word not in all_stopwords]
  review = ' '.join(review)
  corpus.append(review)

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
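
# Evaluation sketch (added): score the fitted GaussianNB model on the held-out reviews.
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))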
Esempio n. 53
0
        del tokens
        sys.stdout.flush()
        i += 1
    print(multiprocessing.current_process().name +
          ' has finished processing files')
    sys.stdout.flush()
    q.put([stemmed_corp, original_corp])


text_filepath = 'C:/Users/Matt/Documents/Data Science/CW/CLEANED_3/'
root_cleaned_filepath = 'C:/Users/Matt/Documents/Data Science/CW/WORDCLOUD_3/'
blacklist = ['document_process.csv']
blacklist_words = [
    'ptl', 'lukes', 'june', 'leads////', 'leads/////', 'leads//////'
]
custom_stopwords = stopwords.words('english') + blacklist_words + [
    punc for punc in string.punctuation
]
# stemmer for reducing words
stemmer = PorterStemmer()
# storing stemmed tokens
stemmed_corpus = []
# storing non-stemmed tokens
original_corpus = []
# list of currently running threads
process_list = []
# queue of information processed by threads
q = multiprocessing.Queue()
# testing
# -1 for all files
filesToIter = 2
df_count.plot(x='category', y='number', kind='bar')
plt.show()

# cleaning dataset
stemmer = PorterStemmer()
corpus = []

english_stops = set(stopwords.words('english'))  # build the stopword set once, not per message

for w in range(len(df['Message'])):
    msg = df['Message'][w]
    msg = re.sub("[^a-zA-Z]", " ", msg)
    msg = msg.lower()
    msg = msg.split()
    msg = [
        stemmer.stem(word) for word in msg
        if word not in english_stops
    ]
    msg = " ".join(msg)
    corpus.append(msg)

# create word vector
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer()
tf.fit(corpus)
# print(tf.vocabulary_)
X = tf.transform(corpus).toarray()

Y = df['Category']

# train test split
from sklearn.model_selection import train_test_split
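
# Continuation sketch (added, not part of the original excerpt): split the TF-IDF matrix
# and fit a simple Naive Bayes baseline on the spam/ham labels.
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
spam_clf = MultinomialNB().fit(X_train, Y_train)
print(accuracy_score(Y_test, spam_clf.predict(X_test)))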
Esempio n. 55
0
def func_tokenize(raw_input):
    try:
        #stop_words = set(stopwords.words('english'))
        new_words_list = stopwords.words('english')
        new_words_list.append('home')  # add 'home' to the stop words because it appears in every document
        stop_words = set(new_words_list)
    except:
        print ('Error creating stop words.  Please verify the stopwords corpus was downloaded prior to running this program')
        print ('Run the following commands in a python shell to download the stop words')
        print ('import nltk')
        print ('nltk.download("stopwords")')
    try:        
        try:
            #tags = re.compile('(b\')((\<script.*?\>).*?(\<\/script\>))|((\<style.*?\>).*?(\<\/style\>))|(\<.*?\>)|(\<.*?\/\>)|(\<\/.*?\>)|(&\w+;)|(html)|(\\\\n)|(\\\\x\w\w)',re.DOTALL) #works at removing style tags
            #tags = re.compile('(b\')((<script.*?>).*?(</script>))|((<style.*?>).*?(</style>))|(<.*?>)|(<.*?/>)|(</.*?>)|(&\w+;)|(html)|(\\\\n)|(\\\\x\w\w)',re.DOTALL) #works at removing style tags
            #tags = re.compile('(<script>.*?</script>)|(<noscript>.*?</noscript>)|(<!--.*?-->)|(<.*?>)|(<.*?>\w)',re.DOTALL)
            #tags = re.compile('(<!.*?>)|(<script>.*?</script>)|(<noscript>.*?</noscript>)|(<.*?>)|((\\u[0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ]+)*)',re.DOTALL)
            #tags = re.compile('(<!.*?>)|(<script>.*?</script>)|(<noscript>.*?</noscript>)',re.DOTALL)
            #tags = re.compile('(<!.*?>)|(<script>.*?</script>)|(<noscript>.*?</noscript>)|([\\u2000-\\u2100])|(\\u00f8)|(\\u00b0)|([\\u0500-\\u0600])|([\\u5000-\\u6000])',re.DOTALL)
            tags = re.compile('(^<.*?>)|(^<!.*?>)|(^<script>.*?</script>)|(^<noscript>.*?</noscript>)|([\\u0080-\\uFFEF])',re.DOTALL)
            #tags = re.compile(r'(<!.*?>)|(<script>.*?</script>)|(<noscript>.*?</noscript>)|(\\u\d*[\s|\w*])',re.DOTALL)
            #tags = re.compile(r'(<!.*?>)|(<script>.*?</script>)|(<noscript>.*?</noscript>)|([^\\u0200-\\uFFFF])',re.DOTALL)##attempt to remove unicode
            reg_numbers = re.compile(r'(\s\d+\s)')
        except:
            print ('Error in regex', sys.exc_info()[0], sys.exc_info()[1])
 
        ### the following section uses Python 3 conventions
        #try:
            ##tr = str.maketrans(" ", " ", string.punctuation)#used to strip punctuation ## need to change for python 2   THis is python 3
        #except:
            #print ('Error removing punctuation', sys.exc_info()[0])     
        ### End Python 3 section
        #strip unicode from string
        try:
            raw_input = (raw_input.decode('unicode_escape').encode('ascii','ignore')) ##
        except:
            # print ('Error removing unicode characters from line var', sys.exc_info()[0], sys.exc_info()[1])
            pass

        try:
            #line = tags.sub(' ',str(raw_input)) #remove html tags ##python 3 code
            line = re.sub(tags,' ',str(raw_input)) #remove html tags
        except:
            print ('Error removing html tags', sys.exc_info()[0], sys.exc_info()[1])
        try:
            
            #line= (line.lower().translate(tr).split())  # convert line to lower case, remove punctuation and tokenize (Python 3; requires uncommenting the maketrans line above)
            #line= (line.lower().translate(None, string.punctuation).split())  # convert line to lower case, remove punctuation and tokenize (Python 2 version)
            #right_num_spaces=" "*256
            punctuation =re.compile('['+string.punctuation+']')
            line= re.sub(punctuation,' ',line)#remove punctuation with regex but replace with a space to preserve words
            #line = re.sub(reg_numbers,'',line)#remove numbers from string
            line=line.lower().split()#convert to lowercase and split into words
           
            
        except:
            print ('Error changing case, removing punctuation and splitting', sys.exc_info()[0], sys.exc_info()[1])
                     
        try:
            line=[word for word in line if word not in stop_words] #remove stop words from raw line
        except:
            print ('Error with stop words', sys.exc_info()[0], sys.exc_info()[1])           
        try:
            stemmer = PorterStemmer() #create a stemmer with the nltk porter stemmer               
            line=[stemmer.stem(term) for term in line] #use nltk stemmer to convert to word roots
        except:
            print ('Error with stemming', sys.exc_info()[0], sys.exc_info()[1])
            pass
        return line
    except:
        print ('Error in tokenizer function', sys.exc_info()[0], sys.exc_info()[1])
        pass
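
# Usage sketch (added): func_tokenize() expects raw bytes and returns a list of stemmed,
# stopword-free tokens; assumes the module-level imports (re, string, sys, nltk stopwords,
# PorterStemmer) from the full script are in place.
sample_page = b'<html><body><p>The quick brown foxes were running home.</p></body></html>'
sample_tokens = func_tokenize(sample_page)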
Esempio n. 56
0
topic summary
"""

import pickle
import random

# for consistent testing
random.seed(1532525625823)

raw_data = pickle.load(open("pickles/list-of-reviews.p", "rb"))

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords

count_vect = CountVectorizer(stop_words=set(stopwords.words('english')))
train_counts = count_vect.fit_transform(random.sample(raw_data, 30000))

raw_data = None
btr = pickle.load(open("pickles/dict-of-business-to-reviews.p", "rb"))

test_counts = count_vect.transform(btr["Appliance Service Center"])

tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(train_counts)
test_tfidf = tfidf_transformer.transform(test_counts)

dtm = train_tfidf
dtm_test = test_tfidf

vocab = count_vect.get_feature_names()
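
# Added sketch: rank the vocabulary by mean TF-IDF weight across the test reviews to get a
# rough topic summary of the selected business.
import numpy as np
mean_weights = np.asarray(dtm_test.mean(axis=0)).ravel()
top_terms = [vocab[i] for i in mean_weights.argsort()[::-1][:10]]
print(top_terms)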
Esempio n. 57
0
import pandas as pd
import numpy as np
import pickle

from DataFormatter import create_dataset
from nltk.corpus import stopwords

# EGC
en_stop = set(stopwords.words('french'))
# extra French function words and EGC domain terms to ignore, on top of the NLTK lists
en_stop.update({'les', 'a', 'ce', 'cet', 'cette', 'article', 'approche', 'données', 'non'})
en_stop.update(set(stopwords.words('english')))

data = pd.read_csv('Data/egc.csv', sep="\t")

data['txt'] = data['title'].astype(str) + ". " + data['abstract'].astype(str)
doc_set = list(data['txt'])
years = np.array(data['year'])
years = years.flatten().tolist()

dataset = create_dataset(doc_set, years, en_stop, l=5, max_df=0.75, min_df=5)
pickle.dump(dataset, open("Data/egc.dwe", "wb"))

inpu = pickle.load(open("Data/egc.dwe", "rb"))
Esempio n. 58
0
def run(queryList):

    # stemmer = PorterStemmer()
    stemmer = SnowballStemmer("english")

    f = open("data/expanded.txt", "w+")
    for query in queryList:
        querySplitted = query.split(",")

        # tokenizing the query
        tokens = nltk.word_tokenize(querySplitted[1])

        # removing stop words in the query (build the stopword set once, not per token)
        english_stops = set(stopwords.words('english'))
        filtered_words = [
            word for word in tokens if word not in english_stops
        ]

        # pos tagging of tokens
        pos = nltk.pos_tag(filtered_words)

        synonyms = []  # synonyms of all the tokens

        index = 0
        # iterating through the tokens
        for item in filtered_words:
            synsets = wordnet.synsets(item)

            if not synsets:
                # stemming the tokens in the query
                synsets = wordnet.synsets(stemmer.stem(item))

            # synonyms of the current token
            currentSynonyms = []
            currentPOS = get_wordnet_pos(pos[index])

            # iterating through the synsets
            for i in synsets:
                # first we check if token and synset have the same part of speech
                if str(i.pos()) == str(currentPOS):
                    for j in i.lemmas():
                        name = j.name().replace("_", " ")
                        if name not in currentSynonyms:  # skip synonyms we have already collected
                            currentSynonyms.append(name)

            # collect this token's synonyms once, after all of its synsets have been checked
            synonyms.append(currentSynonyms)
            index += 1

        f.write(querySplitted[0] + ", " + querySplitted[1] + ", ")

        # removing duplicate lists in the synonyms list
        tmp = []
        for elem in synonyms:
            if elem and elem not in tmp:
                tmp.append(elem)
        synonyms = tmp

        # now that we have all the synonyms
        for x in itertools.product(*synonyms):
            current = ""
            for item in x:
                current += item
                current += " "
            current += ", "
            f.write(current)
        f.write("\n")
from __future__ import unicode_literals
from nltk.corpus import stopwords
import itertools
import string
import os

stop_words_nltk = stopwords.words("english")

stop_words_extra = [
    "right",
    "yeah",
    "okay",
    "ourselves",
    "hers",
    "between",
    "yourself",
    "but",
    "again",
    "there",
    "about",
    "once",
    "during",
    "out",
    "very",
    "having",
    "with",
    "they",
    "own",
    "an",
    "be",
    "some",
Esempio n. 60
0
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from scipy.sparse import hstack
import nltk
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation
import pickle
import numpy as np



mystem = Mystem()
nltk.download("stopwords")  # ensure the stopword corpus is available before requesting it
russian_stopwords = stopwords.words("russian")

NGRAM_RANGE = (1, 5)
# n-gram unit for tokenization ('char' = character n-grams)
TOKEN_MODE = 'char'
# maximum number of features
TOP_K = 1000
# minimum document frequency
MIN_DOCUMENT_FREQUENCY = 2
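
# Added sketch (the vectorizer itself is not shown in this excerpt): one plausible way to wire
# the constants above into a character n-gram TF-IDF vectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    ngram_range=NGRAM_RANGE,          # n-grams of length 1 to 5
    analyzer=TOKEN_MODE,              # 'char' = character n-grams
    max_features=TOP_K,               # keep at most 1000 features
    min_df=MIN_DOCUMENT_FREQUENCY)    # drop n-grams seen in fewer than 2 documents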


def preprocess_text(text):
    tokens = mystem.lemmatize(text.lower())
    tokens = [token for token in tokens if token not in russian_stopwords \
              and token != " " \