Example #1
def normalize_nltk_snowball(token):
    # use the English stemmer (you could also import EnglishStemmer directly);
    # run print(" ".join(SnowballStemmer.languages)) to see which languages
    # Snowball supports
    snowball = SnowballStemmer('english')
    return snowball.stem(token)
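A minimal, self-contained check of the helper above (assumes only that NLTK is installed):

from nltk.stem.snowball import SnowballStemmer

print(" ".join(SnowballStemmer.languages))         # list the supported languages
print(SnowballStemmer("english").stem("running"))  # -> "run"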
 def classify(self, sText):
    """Given a target string sText, this function returns the most likely document
    class to which the target string belongs (i.e., positive, negative or neutral).
    """
    tokens = self.tokenize(sText)
    posProbability, negProbability = 0, 0
    posNum, negNum = float(sum(self.pos_dic.values())), float(sum(self.neg_dic.values()))
    stemmer = SnowballStemmer("english")
    for i in range(len(tokens) - 1):
        if not isPunctuationMark(tokens[i]):
            unigram = stemmer.stem(tokens[i])
            second_word = stemmer.stem(tokens[i + 1])
            try:
                bigram = unigram + " " + second_word
            except UnicodeDecodeError:
                continue
            # add-one (Laplace) smoothing; log probabilities avoid underflow
            posProbability += math.log(float((self.pos_dic.get(bigram, 0) + 1)) / posNum)
            posProbability += math.log(float((self.pos_dic.get(unigram, 0) + 1)) / posNum)
            negProbability += math.log(float((self.neg_dic.get(bigram, 0) + 1)) / negNum)
            negProbability += math.log(float((self.neg_dic.get(unigram, 0) + 1)) / negNum)
    if tokens:
        posProbability += math.log(float((self.pos_dic.get(tokens[-1], 0) + 1)) / posNum)
        negProbability += math.log(float((self.neg_dic.get(tokens[-1], 0) + 1)) / negNum)
    if posProbability > negProbability:
        return "positive"
    else:
        return "negative"
def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top
        (in Part 2, you will also add stemming capabilities)
        and return a string that contains all the words
        in the email (space-separated) 
        
        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
        
        """


    f.seek(0)  ### go back to beginning of file (annoying)
    all_text = f.read()

    ### split off metadata
    content = all_text.split("X-FileName:")
    words = ""
    if len(content) > 1:
        ### remove punctuation
        text_string = content[1].translate(string.maketrans("", ""), string.punctuation)

        ### split the text string into individual words, stem each word,
        ### and append the stemmed word to words (make sure there's a single
        ### space between each stemmed word)
        stemmer = SnowballStemmer("english")
        stemmed_words = []
        for word in text_string.split():
            stemmed_words.append(stemmer.stem(word.strip()))
        words = " ".join(stemmed_words)

    return words
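The translate/maketrans calls in these parseOutText variants use the Python 2 API; a rough Python 3 equivalent of the same punctuation-stripping and stemming step (same "X-FileName:" convention, hypothetical function name) might look like this:

import string
from nltk.stem.snowball import SnowballStemmer

def parse_out_text_py3(f):
    f.seek(0)
    content = f.read().split("X-FileName:")
    if len(content) < 2:
        return ""
    # str.maketrans with a third argument deletes every punctuation character
    text_string = content[1].translate(str.maketrans("", "", string.punctuation))
    stemmer = SnowballStemmer("english")
    return " ".join(stemmer.stem(word) for word in text_string.split())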
Example #4
    def _text_to_words(self, text):
        '''
        Processes a text and returns a list of words.
        The processing performs the following steps:
            - lowercase the text
            - tokenization
            - stop word removal
            - stemming of the words
        '''
        # Lowercase the text
        text = text.lower().strip()

        # Tokenization
        tokens = word_tokenize(text, language="english")

        # Drop tokens that start with an apostrophe
        # (tokenization turns "I'd like" into ["I", "'d", "like"],
        #  and we can do without "'d")
        tokens = [token for token in tokens if not token.startswith("'")]

        # Stop words
        # Remove the stop words from our vector.
        # In addition to the stopwords provided with the collection, add the common
        # English words given by NLTK and punctuation (except parentheses, which are
        # useful for boolean queries).
        stop_words = self.stop_words + list(string.punctuation) + stopwords.words("english")
        tokens = [token for token in tokens if token not in stop_words]

        # Stemming
        stemmer = SnowballStemmer(language="english")
        tokens = [stemmer.stem(word) for word in tokens]

        return tokens
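The tokenizer and stopword calls used throughout these examples assume the corresponding NLTK data packages have been fetched once; a minimal setup sketch:

import nltk

nltk.download("punkt")      # needed by word_tokenize / sent_tokenize
nltk.download("stopwords")  # needed by stopwords.words(...)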
def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top
        (in Part 2, you will also add stemming capabilities)
        and return a string that contains all the words
        in the email (space-separated)
        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
        """


    f.seek(0)  ### go back to beginning of file (annoying)
    all_text = f.read()

    ### split off metadata
    content = all_text.split("X-FileName:")
    words = ""

    stemmer = SnowballStemmer("english")
    if len(content) > 1:
        text_string = content[1].translate(string.maketrans("", ""), string.punctuation)
        
        split = text_string.split()  
        text = [stemmer.stem(word) for word in split]
        words = ' '.join(text)


    f.close()

    return words.strip()
Example #6
def main():
    parser = argparse.ArgumentParser(description='Evaluate translation hypotheses.')
    parser.add_argument('-i', '--input', default=baseline_path+'data/hyp1-hyp2-ref',
            help='input file (default data/hyp1-hyp2-ref)')
    parser.add_argument('-n', '--num_sentences', default=None, type=int,
            help='Number of hypothesis pairs to evaluate')
    # note that if x == [1, 2, 3], then x[:None] == x[:] == x (copy); no need for sys.maxint
    opts = parser.parse_args()

    # we create a generator and avoid loading all sentences into a list
    def sentences():
        with open(opts.input) as f:
            for pair in f:
                yield [sentence.strip().split() for sentence in pair.split(' ||| ')]

    english_stemmer = SnowballStemmer("english")

    # note: the -n option does not work in the original code
    for h1, h2, ref in islice(sentences(), opts.num_sentences):
        # Perform morphological stemming before calculating METEOR score
        h1 = [english_stemmer.stem(word) for word in h1]
        h2 = [english_stemmer.stem(word) for word in h2]
        ref = [english_stemmer.stem(word) for word in ref]

        rset = set(ref)
        h1_match = meteor(h1, rset)
        # print "meteor is h1_match ", h1_match
        h2_match = meteor(h2, rset)
        # print "meteor is h2_match ", h2_match
        print(1 if h1_match > h2_match else # \begin{cases}
                (0 if h1_match == h2_match
                    else -1)) # \end{cases}
Example #7
	def get_stemm_tags(self, tags):
		stemm_tags = []
		current_stemmer = SnowballStemmer('english')
		for tag in tags:
			stemm_tags.append(current_stemmer.stem(tag.lower()))
		
		return stemm_tags
def tokenize(string, stem=True, entire=False):
    """
    INPUT: string
    OUTPUT: a list of words
    """
    string = string.replace("/", " ")
    string = string.replace("-", " ")
    tokenizer = PottsTokenizer(preserve_case=False)
    token_list = tokenizer.tokenize(string)
    punctuation = re.compile(r'[-.?!,":;$/*()|0-9]')  # strip these punctuation characters and digits
    token_list = [punctuation.sub("", word) for word in token_list]
    token_list = list(filter(None, token_list))  # drop empty strings

    #filter out stopwords 
    STOPWORDS = set(nltk.corpus.stopwords.words('english'))
    STOPWORDS.update((
        'would', 'does', 'got', "doesn't", "it's", "isn't", "don't", "i'm", "i'll", "i've",
        "=", "can't", "didn't", "etc", "+", "%", "won't", "that's", "nikon", "g", "&",
        "sure", "may", "yet", "ok", "haven't", "else", "maybe", "wouldn't", "couldn't",
        "via", "rt", "'", "you're", "almost", "v", "there's", "#", 'well', 'somehow',
        'someone', 'something', 'sometime', 'sometimes', 'somewhere'))
    if entire:
        # if need a larger set
        stopwords_entire_list = loadEntireStopWord()
        STOPWORDS.update(set(stopwords_entire_list))
    token_list = [word for word in token_list if word not in STOPWORDS]

    #stemmer 
    if stem:
        stemmer = SnowballStemmer("english")
        token_stem_list = [stemmer.stem(token) for token in token_list]
        token_list = token_stem_list

    return token_list
    def tokenize(self, document):
        """
        Break text into sentences and each sentence into a list of single words
        Ignore any token that falls into the stopwords set.
        """
        # use sentence tokenizer sent_tokenize from nltk package
        sentences = sent_tokenize(utils.to_unicode(document.lower()))

        # create stemmer of class SnowballStemmer
        stemmer = SnowballStemmer("english")

        for sentence in sentences:
            words = [word
                   for word in utils.tokenize(
                    self.cleanse_text(sentence)
                   )]

            if self.remove_stopwords:
                words = [ 
                         word for word in words 
                         if word not in self.en_stopwords
                        ]

            if self.stemming:
                words = [stemmer.stem(t) for t in words]

            yield words
def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top, stem words
        and return a string that contains all the words
        in the email (space-separated)
        
        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
        
        """
    f.seek(0)  ### go back to beginning of file (annoying)
    all_text = f.read()

    ### split off metadata
    content = all_text.split("X-FileName:")
    words = ""
    if len(content) > 1:
        ### remove punctuation
        text_string = content[1].translate(string.maketrans("", ""), string.punctuation)

        ### split the text string into individual words, stemming each word,
        ### and appending the stemmed word to words
        words = text_string.strip().split()
        stemmer = SnowballStemmer("english")
        stemmed_text_string = ""
 
        for word in words:
            stemmed_text_string += stemmer.stem(word) + " "

    return stemmed_text_string.strip()
def tokenize(s, stem=True, digit=False, stop=True, use_re=False):
    """
    :type s: str
    :type stem: bool
    :type use_re: bool
    :rtype: set(str)
    """
    stop_words = stopwords.words('english')
    stemmer = SnowballStemmer('english')
    wordnet = WordNetLemmatizer()
    table = string.maketrans("","")

    if use_re:
        s = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', s)

    if digit:
        tokens = set(word_tokenize(unify_units(s).translate(table, string.punctuation + string.digits)))
    else:
        tokens = set(word_tokenize(unify_units(s).translate(table, string.punctuation)))

    if stop:
        tokens = set(word for word in tokens if word not in stop_words)

    if stem:
        tokens = set(stemmer.stem(word) for word in tokens)

    return tokens
Example #12
    def test_spanish(self):
        stemmer = SnowballStemmer('spanish')

        assert stemmer.stem("Visionado") == 'vision'

        # The word 'algue' was raising an IndexError
        assert stemmer.stem("algue") == 'algu'
Example #13
def text_cleaner_and_tokenizer(texts):
    """
    takes a list of sentences, removes punctuation, numbers, stopwords and stems.
    Then joins everything back together and returns the filtered texts as a list of unicode strings
    :param texts: list of unprocessed strings
    :return: list of unicode strings
    """
    i = 0
    stopword_list = set(stopwords.words('danish'))
    stemmer = SnowballStemmer("danish", ignore_stopwords=False)
    filtered_texts = []

    for sentence in texts:
        # strip punctuation and digits; `punctuation` is expected to come from the
        # string module and `numbers` to be an iterable of the digits 0-9
        for symbol in punctuation:
            sentence = sentence.replace(symbol, '')
        for num in numbers:
            sentence = sentence.replace(str(num), '')
        # the input sentences are bytestrings here (Python 2), hence the decode
        sentence = sentence.decode('utf-8').lower()
        words_in_sentence = word_tokenize(sentence, language='danish')
        filtered_sentence = []
        for word in words_in_sentence:
            if word not in stopword_list:
                stem_word = stemmer.stem(word)
                filtered_sentence.append(stem_word)

        sentence = ' '.join(filtered_sentence)
        filtered_texts.append(sentence)

        i = i +1
        if i % 1000 == 0:
            print(i)
    print('Done :D!')
    return filtered_texts
Example #14
def stem_stopword_clean( vett_strings ):
    '''
    Takes a vector of student or job strings and returns each element of the list
    as a unique, stemmed word. Splits elements made of several words and removes
    the stopwords.
    :param vett_strings: vector of strings
    :return: vector of stemmed words without stopwords
    '''

    # import the stemming library and the stopword list
    from nltk.stem.snowball import SnowballStemmer
    from nltk.corpus import stopwords

    stemmer = SnowballStemmer("italian")

    stop = set(stopwords.words('italian'))

    # logger.error(stemmer.stem("italian"))
    # logger.error(stemmer.stem("a"))
    # logger.error(stemmer.stem("andate tutti a correre"))

    documents=[]

    # logger.error(stop)

    stem_parola=''

    for frasi in vett_strings:
        for parola in frasi.split(" "):
            stem_parola=stemmer.stem(parola)
            if(stem_parola not in stop and stem_parola not in documents):
                documents.append(stem_parola)


    return documents
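A small usage sketch of stem_stopword_clean (assumes the NLTK Italian stopword list has been downloaded; the exact stems depend on the Italian Snowball rules):

frasi = ["Mi piace correre al parco", "Correre fa bene"]
print(stem_stopword_clean(frasi))  # deduplicated Italian stems, stopwords removed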
def pos_tokenizer(s): #define a tokenizer that uses POS tagging
    texts=nltk.word_tokenize(s)

    texts=[word for word in texts if len(word)>2]

    # PULL OUT NOUN AND VERB PHRASES
    chunktext=nltk.pos_tag(texts)
    patterns="""
                VP:{<V.*><DT>?<JJ.*>?<NN.*>}
                NP:{<DT>?<JJ>*<NN.*>}
                N:{<NN.*>}
    """
    NPchunker=nltk.RegexpParser(patterns)

    from nltk.stem.snowball import SnowballStemmer
    st=SnowballStemmer('english')

    #print text
    temp=[]
    result=NPchunker.parse(chunktext)
    #print result
    for phrase in result:
        try:
            phrase.label()
            string=''
            m=0
            for word in phrase:
                if m==0:
                    string+=st.stem(word[0])
                    m+=1
                else: string+=' '+st.stem(word[0])
            temp.append(string)
        except: pass
    return temp
def parseOutBody(f):
    from nltk.stem.snowball import SnowballStemmer
    import string
  
    

    f.seek(0)  ### go back to beginning of file (annoying)
    all_text = f.read()

    ### split off metadata
    content = all_text.split("X-FileName:")
    words = ""
    if len(content) > 1:
        ### remove punctuation
        text_string = content[1].translate(string.maketrans("", ""), string.punctuation).split()

        ### project part 2: comment out the line below
        #words = text_string

        ### split the text string into individual words, stem each word,
        ### and append the stemmed word to words (make sure there's a single
        ### space between each stemmed word)
        
        stemmer = SnowballStemmer('english')
        
        for word in text_string:
            word = word.strip()
            word = stemmer.stem(word)            
            words = words + ' ' + word
    else:
        pass


    return words
Example #17
 def __init__(self, df, column, n):  # gets the most frequent words in a document
   
     texto = " ".join(str(x) for x in df[column].values)
     tokens = texto.split()
     tokens=[x.lower() for x in tokens]
     #stopset = set(stopwords.words('english')) # dictionary of stop words
     #tokens = [w for w in tokens if not w in stopset]
     stemmer=SnowballStemmer("english")
     stemm_words=[]
     tokens_clean=[]
     for j in tokens:
       
       sa=re.sub('[^A-Za-z]+', '', j)
       tokens_clean.append(sa)
     #print tokens_clean
     for s in tokens_clean:
       try:
         stem= stemmer.stem(s)
         if s!='':
          stemm_words.append(str(stem)) 
       except:
         pass
     cuenta = len(tokens_clean)
     largo =  Counter(stemm_words).most_common(n)
     topdic = dict(largo)
     asortado = Series(topdic)
     ordenado = asortado.sort_values(ascending=False)  # Series.order() was removed in newer pandas
     ordenadolist= topdic.keys() #+stemm_words
     self.top=ordenadolist
Example #18
def clean_text(text):
    
    #remove numbers
    text = ''.join(i for i in text if not i.isdigit())
    
    #create bag of words
    tokenized = nltk.word_tokenize(text)
    
    #lower case
    lowercase = [word.lower() for word in tokenized]
    
    #load stopwords
    stopwords = pd.read_csv('.\\source_data\\stopwords.csv',encoding='latin1', header=None,names=['word'])
    stopwords = list(stopwords['word'])
    
    #remove stopwords    
    filtered_words = [word for word in lowercase if word not in stopwords]
    
    #remove punctuation
    punct = set(string.punctuation)
    filtered_words = [word for word in filtered_words if word not in punct]
    
    #stem text
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    stemmed_words = [stemmer.stem(word) for word in filtered_words]
              
    stemmed_text = ' '.join(stemmed_words)               
                     
    return stemmed_text     
def clean_single_word(word, lemmatizing="wordnet"):
    """
    Performs stemming or lemmatizing on a single word.

    If we are to search for a word in a clean bag-of-words, we need to search it after the same kind of preprocessing.

    Inputs: - word: A string containing the source word.
            - lemmatizing: A string containing one of the following: "porter", "snowball" or "wordnet".

    Output: - lemma: The resulting clean lemma or stem.
    """
    if lemmatizing == "porter":
        porter = PorterStemmer()
        lemma = porter.stem(word)
    elif lemmatizing == "snowball":
        snowball = SnowballStemmer('english')
        lemma = snowball.stem(word)
    elif lemmatizing == "wordnet":
        wordnet = WordNetLemmatizer()
        lemma = wordnet.lemmatize(word)
    else:
        print("Invalid lemmatizer argument.")
        raise RuntimeError

    return lemma
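A hedged usage sketch of clean_single_word, assuming the NLTK wordnet data is available:

for method in ("porter", "snowball", "wordnet"):
    print(method, clean_single_word("running", lemmatizing=method))
# porter and snowball both yield "run"; the WordNet lemmatizer defaults to the
# noun part of speech, so "running" comes back unchanged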
def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top
        and return a string that contains all the words
        in the email (space-separated)
        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
        """
    f.seek(0)
    all_text = f.read()

    ### split metadata off
    content = all_text.split("X-FileName:")
    words = ""
    st = ""
    if len(content) > 1:
        text_string = content[1].translate(string.maketrans("", ""), string.punctuation)
        stemmer = SnowballStemmer("english")
        # stem each word and build a single space-separated string
        for word in text_string.split():
            st = st + " " + stemmer.stem(word)

    words = st.lstrip()
    return words
def get_info_from_df(df, labeled=False, check_ratios=True, nostops=True, snowball=True, bigrams=True):
    """
    INPUT: DataFrame, boolean for presence of change_type labels,
           several optional parameters for text processing
    OUTPUT: text processed according to specified parameters,
            change_type labels (if specified), list of ratios (see above)
    """
    change_texts = []
    labels = []
    ratios = []
    for i, row in df.iterrows():
        for j, text in enumerate(row["text_no_abi"]):
            change_texts.append(text.lower().replace("\n", ""))
            if labeled:
                labels.append(row["change_type"][j])

    if check_ratios:
        change_texts, ratios = add_ratios(change_texts)
    if nostops:
        stops = set(stopwords.words("english"))
        change_texts = [" ".join([word for word in text.split(" ") if word not in stops]) for text in change_texts]
    if snowball:
        snowball = SnowballStemmer("english")
        change_texts = [" ".join([snowball.stem(word) for word in text.split(" ")]) for text in change_texts]
    if bigrams:
        change_texts = add_bigrams(change_texts)

    if labeled:
        return change_texts, ratios, labels
    else:
        return change_texts, ratios
Example #22
def clean_text(list_o_text):
    docs = [''.join([char if char not in punctuation else ' ' for char in 
                     comic]) for comic in list_o_text]

    # remove punctuation from string
    docs = [word_tokenize(comic) for comic in docs]
    # make string into list of words

    # 3. Strip out stop words from each tokenized document.
    stop = set(stopwords.words('english'))
    stop.update(punctuation)
    other_words = ['cite', 'cite_note', 'cite_ref', 'class', 'href', 'id', 
                   'redirect', 'ref', 'refer', 'span', 'sup', 'title', 'wiki']
    stop.update(other_words)
    docs = [[word for word in words if word.strip(punctuation) not in stop] 
            for words in docs]
    # remove stop words
    
    # Stemming / Lemmatization
    # 1. Stem using both stemmers and the lemmatizer
    #porter = PorterStemmer()
    snowball = SnowballStemmer('english')
    #wordnet = WordNetLemmatizer()
    #docs_porter = [[porter.stem(word) for word in words] for words in docs]
    docs_snowball = [[snowball.stem(word) for word in words] for words in docs]
    #docs_wordnet = [[wordnet.lemmatize(word) for word in words] for words in docs]
    docs = [' '.join(doc) for doc in docs_snowball]
    # for each document, it becomes a long string
    return docs
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = []
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    for item in tokens:
        stems.append(stemmer.stem(item))
    return stems
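The ignore_stopwords=True flag makes the stemmer leave NLTK's stopwords for that language unstemmed; a quick sketch of the difference (stopwords corpus required):

from nltk.stem.snowball import SnowballStemmer

print(SnowballStemmer("english").stem("having"))                         # "have"
print(SnowballStemmer("english", ignore_stopwords=True).stem("having"))  # "having"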
def clean_data(data):
    '''
    Stems and removes stop words from training and test data
    '''
    stemmer = SnowballStemmer('english')
    stop = stopwords.words('english')
    for column_name in ['query', 'product_title', 'product_description']:
        for index, row in data.iterrows():
            warnings.filterwarnings('error')
            try:
                extracted_data = (' ').join(
                    [i for i in BeautifulSoup(row[column_name], 'lxml')
                    .get_text(' ')
                    .split(' ')
                    ])
            except UserWarning:
                pass
            cleaned_data = re.sub('[^a-zA-Z0-9]',' ', extracted_data)
            stemmed_data = (' ').join(
                [stemmer.stem(i) for i in cleaned_data.split(' ')
                ])
            remove_stop_words = (' ').join(
                [i for i in stemmed_data.split(' ') if i not in stop]
                )
            data.set_value(index, column_name, unicode(remove_stop_words))
    return data
Example #25
def get_core_words( text ):

    #TOKENIZATION
    b = word_tokenize(text)

    #KEEP ONLY NOUNS
    b = [noun for noun, pos in pos_tag(b) if pos.startswith('N')]

    #CONVERT INTO LOWER CASE
    looper = 0
    for token in b:
        b[looper] = token.lower()
        looper+=1

    #REMOVE THE STOPWORDS FROM THE FILE
    minlength = 2
    c = [token for token in b if (not token in stopwords.words('english')) and len(token) >= minlength]

    #STEMMING THE WORDS TO ITS BASE FORM
    stemmer = SnowballStemmer("english")
    looper1 = 0
    for token in c:
        c[looper1] = stemmer.stem(token.decode("utf8"))
        looper1 +=1
    return c
Example #26
def processFile(fh):
    with gzip.open(fh, 'rb') as f:
        tree = etree.parse(f)
        root = tree.getroot()        
        r = re.compile('^[a-zA-Z]+$')
        s = SnowballStemmer("english")

        paragraphs = root.xpath('DOC[@type="story"]/TEXT/P')        
        
        for p in paragraphs:            
            try:
                sentences = PunktSentenceTokenizer().sentences_from_text(p.text)

                for sentence in sentences:                
                    tokens = TreebankWordTokenizer().tokenize(sentence)

                    #Filter by alphabetic only
                    alphabetic = filter(r.match, tokens)
                    #Filter by stopwords & stem all leftover tokens
                    stop_filtered = [s.stem(w) for w in alphabetic if w.lower() not in stopwords.words('english')]

                    print (" ").join(stop_filtered).upper()
            except:
                continue        


    return True
Example #27
def prune(doc, stoplist = None, stem = True, english_dictionary_words = False):
    """This takes a single document and tokenizes the words, removes
    undesirable elements, and prepares it to be loaded into a dictionary.
    """
    # Tokenize the document and make it lowercase
    temp = utils.simple_preprocess(doc.lower())

    # Remove freestanding punctuation and punctuation in words
    temp = [w for w in temp if w not in string.punctuation]
    temp = [rmPunct(w) for w in temp]

    # Remove words in passed stoplist
    if stoplist:
        temp = [w for w in temp if w not in stoplist]

    # Remove specific tokens
    temp = [w for w in temp if w not in set(['[', ']', "'", '\n', 'com'])]

    # Remove stopwords
    temp = [w for w in temp if w not in stopwords.words('english')]

    # Stem the remaining words
    if stem:
        stemmer = SnowballStemmer('english')
        temp = [stemmer.stem(w) for w in temp]

    if english_dictionary_words:
        d = enchant.Dict("en_US")
        temp = [w for w in temp if d.check(w)]
    return temp
def read_corpus(corpus_file, use_sentiment):
	"Reads in the corpus and returns the documents and labels"
	documents = []
	labels = []
	with open(corpus_file, encoding='utf-8') as f:
		for line in f:
			tokens = line.strip().split()
			use_stopword = False
			if use_stopword:
				stopwordfile = open('stopwords.txt', 'r')
				stopwords = []
				for line in stopwordfile:
					if len(line) > 0:
						splitline = line.split(',')
						for word in splitline:
							stopwords.append(word)

				tokenlist = [token for token in tokens[3:] if token not in stopwords]
				documents.append(find_ngrams(tokenlist, 2))
			else:
				snowballstemmer = SnowballStemmer('english')
				stemmedtokens = [snowballstemmer.stem(word) for word in tokens[3:]]
				#documents.append(stemmedtokens)
				documents.append(find_ngrams(stemmedtokens, 2))
			if use_sentiment:
				# 2-class problem: positive vs negative
				labels.append( tokens[1] )
			else:
				# 6-class problem: books, camera, dvd, health, music, software
				labels.append( tokens[0] )

	return documents, labels
Example #29
def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top
        
        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
        
        """

    stemmer = SnowballStemmer("english")
    
    f.seek(0)  ### go back to beginning of file (annoying)
    all_text = f.read()

    ### split off metadata
    content = all_text.split("X-FileName:")
    words = ""
    if len(content) > 1:
        ### remove punctuation
        text_string = content[1].translate(string.maketrans("", ""), string.punctuation)

        ### split the text string into individual words, stem each word,
        ### and append the stemmed word to words (make sure there's a single
        ### space between each stemmed word)
        
        words = ' '.join([stemmer.stem(word) for word in text_string.split()])
        
    return words
def cleaned_bag_of_words_dataset(data_matrix, stemming=False, stop_words=None, TFIDF=False, ngram_range=(1, 1), max_features=None,
                                 length=False, number_in_tweet=False, words_present=[]):
    if stemming:
        stemmer = SnowballStemmer("english")
        tweets = [" ".join([stemmer.stem(word) for word in word_tokenize(data_point[2].lower().decode("utf8"))]) for data_point in data_matrix]
    else:
        tweets = [data_point[2].lower() for data_point in data_matrix]
        
    if TFIDF:
        vectorizer = TfidfVectorizer(stop_words=stop_words, ngram_range=ngram_range, max_features=max_features)
    else:
        vectorizer = CountVectorizer(stop_words=stop_words, ngram_range=ngram_range, max_features=max_features)
    
    dataset = vectorizer.fit_transform(tweets).toarray()
    
    if length:
        lengths = np.array([[len(word_tokenize(data_point[2].decode("utf8")))] for data_point in data_matrix])
        dataset = np.concatenate((dataset, lengths), axis=1)
     
    if number_in_tweet:
        numbers = []
        for data_point in data_matrix:
            number_list = list_of_ints_from_string(data_point[2])
            filtered_number_list = [number for number in number_list if abs(number) < 10]
            if len(filtered_number_list) == 0:
                numbers.append([0])
            else:
                numbers.append([np.mean(filtered_number_list)])
        dataset = np.concatenate((dataset, numbers), axis=1)

    for word in words_present:
        word_present = np.array([[int(word.lower() in word_tokenize(data_point[2].lower().decode("utf8")))] for data_point in data_matrix])
        dataset = np.concatenate((dataset, word_present), axis=1)
        
    return dataset
Example #31
def show_entry_fields():
    url = 'http://api.hh.ru/vacancies?text=' + (
        e1.get()) + '&page=0&per_page=100'
    data = requests.get(url).json()
    print("Поиск вакансий")
    p = json.dumps(data)
    res2 = json.loads(p)
    i = 0
    texts = []
    total_word = []
    window = tk.Toplevel(root)
    window.minsize(1300, 1000)
    window.title(u"Вывод данных")
    #webbrowser.open("index.html")
    w00 = Label(window, text=u"ВАКАНСИИ", font="Times")
    w00.place(relx=0.2, rely=0.01)
    t1 = Text(window, height=60, width=75)
    t1.place(relx=0.01, rely=0.03)
    w11 = Label(window, text=u"НАПИСАТЬ СОПРОВОДИТЕЛЬНОЕ ПИСЬМО", font="Times")
    w11.place(relx=0.64, rely=0.57)
    t2 = Text(window, height=20, width=70)
    t2.place(relx=0.52, rely=0.6)
    while i < len(res2['items']):
        a = ((res2['items'][i]['id']))  #['requirement']
        #print (a)
        #print ((res2['items'][i]['name']))
        aa = ((res2['items'][i]['snippet']['requirement']))
        #aa=(res2['items'][i]['snippet']['requirement']).replace('<highlighttext>', '')
        #patt = re.compile('(\s*)aa(\s*)')
        print(aa)

        texts.append(aa)
        #wordpunct_tokenize(str(aa))
        tokenizer = RegexpTokenizer(r'\w+')
        #print (stopwords.words('english'))
        (total_word.extend(tokenizer.tokenize(str(aa))))

        aaa = str(i + 1) + ') ' + str(res2['items'][i]['name']) + ' | ' + str(
            res2['items'][i]['area']['name']) + '\n'
        t1.insert(END, (aaa))
        i = i + 1

    # ---------------------------------------------------------------------- build the results output window
    stopwords = nltk.corpus.stopwords.words('english')
    en_stop = get_stop_words('en')
    stemmer = SnowballStemmer("english")
    #print stopwords[:10]

    # -------------------------------------------------------------------------- latent Dirichlet allocation
    #w8=Label(window,text=u"ОСНОВНЫЕ ТЕМЫ И СЛОВА", font = "Times")
    #w8.place(relx=0.17, rely=0.53)
    #t8=Text(window, height=24, width=75)
    #t8.place(relx=0.01, rely=0.57)
    texts = []
    stopped_tokens = [i for i in total_word if not i in en_stop]
    #print le(stopped_tokens)
    p_stemmer = PorterStemmer()
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    #print len(stemmed_tokens), stemmed_tokens
    texts.append(stemmed_tokens)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = gensim.models.LdaModel(corpus,
                                      num_topics=100,
                                      id2word=dictionary,
                                      passes=20)
    a = ldamodel.print_topics(num_topics=10, num_words=7)
    #print ldamodel.print_topics(num_topics=4, num_words=7)[0][1]
    #print a
    num_topics = 5
    topic_words = []
    for i in range(num_topics):
        tt = ldamodel.get_topic_terms(i, 10)
        topic_words.append([dictionary[pair[0]] for pair in tt])
    #print topic_words[0]
    jj = 0
    while jj < len(topic_words):
        topic11 = ((u"Тема #%d:" % (jj + 1)) + "\n" +
                   "-".join(topic_words[jj]) + "\n")
        #t8.insert(END, topic11)
        #print(u"Тема #%d:" % (jj+1))
        #print("-".join(topic_words[jj]))
        jj = jj + 1
    # -------------------------------------------------------------------------- identify the core competencies
    vec = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=.5)
    tfv = vec.fit_transform(stopped_tokens)
    terms = vec.get_feature_names()
    result = list(set(list_skills) & set(terms))
    print(result)
    text_file = open("Output.txt", "w")
    text_file.write(result[2])
    text_file.close()
    wc = WordCloud(height=1000, width=1000,
                   max_words=1000).generate(" ".join(terms))
    nmf = NMF(n_components=11).fit(tfv)
    #for idx, topic in enumerate(nmf.components_):
    #print(u"Тема #%d:" % (idx+1))
    #print(" ".join([terms[i] for i in topic.argsort()[:-10 - 1:-1]]))
    # -------------------------------------------------------------------------- term-distribution figure (word cloud)
    w8 = Label(window, text=u"РАСПРЕДЕЛЕНИЕ НАВЫКОВ", font="Times")
    w8.place(relx=0.66, rely=0.01)
    fig = plt.figure(figsize=(5, 5))
    im = plt.imshow(wc)
    canvas = FigureCanvasTkAgg(fig, master=window)
    canvas.show()
    canvas.get_tk_widget().place(
        relx=0.54, rely=0.03)  #pack(side=TOP, fill=BOTH, expand=1)
    canvas._tkcanvas.place(relx=0.52,
                           rely=0.03)  #pack(side=TOP, fill=BOTH, expand=1)
    # -------------------------------------------------------------------------- sentiment scoring
    c = Button(window,
               text=u"Подтвердить квалификацию",
               font="Times 14 bold",
               command=scoring,
               bg="deep sky blue")
    c.place(relx=0.95, rely=0.97, anchor=SE)
    c1 = Button(window,
                text=u"Откликнуться",
                font="Times 14 bold",
                command=testing,
                bg="lime green")
    c1.place(relx=0.7, rely=0.97, anchor=SE)
Example #32
    "King Sihanouk declined to chair talks in either place.",
    "A U.S. House resolution criticized Hun Sen's regime while the opposition tried to cut off his access to loans.2",
    "But in November the King announced a coalition government with Hun Sen heading the executive and Ranariddh leading the parliament.",
    "Left out, Sam Rainsy sought the King's assurance of Hun Sen's promise of safety and freedom for all politicians."
]

sents = [
    "Budget negotiations between the White House and House Republicans were delayed on several issues.",
    "At issue were provisions that included requiring Federal Health Insurance providers to provide contraceptives to women as Well as a provision to build a road across a wildlife preserve in Alaska.",
    "The contraceptive issue faced an uncertain future while Clinton likely will veto the road.",
    "There is disagreement also on how to spend the funding on education.",
    "This year's budget discussions also have been hampered because it is the first time since budget procedures were established in 1974 that there has been a surplus, preventing agreement on a budget resolution."
]

sentences = parser.raw_parse_sents(sents)
language = 'english'
stemmer = SnowballStemmer(language)
stoplist = set(stopwords.words(language))

for sent in sentences:
    phrases = []
    parsestr = unicode(list(sent)[0])
    #print 'Sent:', parsestr
    tokens = Tree.fromstring(parsestr).leaves()
    print(tokens)
    hash_pos_tokens, phrases = get_parse_info(parsestr, stemmer, language,
                                              stoplist)
    check = prune_phrases(phrases, stoplist, stemmer, language)
    for x in check:
        print(unicode(x))
    print('No. of phrases:', len(check))
import emoji
import tweepy

# Original Working Directory
owd = os.getcwd()


# Twitter App Credentials
consumer_key = "N6EHubErC6jwd5eDqDBoJ3iW1"
consumer_secret = "oOmJLOmlaEk7bR6R6KYsFguS2yeRmducUOKIWGZ8wmRuQ70nB0"
access_key = "598773124-jLVnqszY1MMYDbeD1vjBYeq6rx5O2QxCxtCm3IFM"
access_secret = "URaUh3VzdJJ6jgLejtny5U4I5uo4wlKGCpDgOwANn0ixZ"


en_stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

def write_csv(file_name):
    with open(file_name, 'w') as f:
        writer = csv.writer(f)


def get_tweets(searchQuery, lang, tweets_max=1000):
	auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
	auth.set_access_token(access_key, access_secret)
	api = tweepy.API(auth)
	maxTweets = tweets_max # Some arbitrary large number
	tweetsPerQry = 100  # this is the max the API permits
	fName = 'emirates_mentioned_tweets' # We'll store the tweets in a CSV file.
	sinceId = None
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from imblearn.over_sampling import SMOTE
from collections import Counter
import re
import nltk
import csv
from nltk.stem.snowball import SnowballStemmer
stemmer2 = SnowballStemmer("english", ignore_stopwords=True)
from nltk.corpus import stopwords  # Import the stop word list


def extract_numbers(tags):
    tags_only = re.sub("[^0-9,]", '', tags).split(',')
    tagsList = map(int, tags_only)
    return tagsList


def getSolution(list1, list2, list3, list4):
    list1 = list(set(extract_numbers(list1)))
    list2 = list(set(extract_numbers(list2)))
    list3 = list(set(extract_numbers(list3)))
    list4 = list(set(extract_numbers(list4)))
    list1 = Counter(list1)
    list2 = Counter(list2)
Example #35
from gensim.utils import tokenize
from nltk.stem.snowball import SnowballStemmer

from utils import compose

preprocessors = [
    lambda text: list(tokenize(text)),
    lambda g: (SnowballStemmer("russian", True).stem(g_) for g_ in g),
]
stop_words = []


def rm_stop_words(text):
    return [token for token in text if token not in stop_words]


def preprocess(texts):
    return (compose(*preprocessors)(text) for text in texts)


if __name__ == "__main__":
    from argparse import ArgumentParser
    from read import read
    from pprint import pprint

    parser = ArgumentParser()
    parser.add_argument("-i", dest="path", type=str, help="path to text")
    args = parser.parse_args()
    pprint(list(preprocess(read(args.path))))
Example #36
from nltk.tokenize import word_tokenize
import nltk

from collections import Counter
import string

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

from nltk.corpus import stopwords
nltk.download('stopwords')
stopword_list = stopwords.words('english')

from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()


def remove_punc(tokens):
    clean_tokens = []
    for tok in tokens:
        if tok not in string.punctuation:
            if tok != "''" and tok != '``' and tok != "'s":
                clean_tokens.append(tok)
    return clean_tokens


def remove_stopwords(tokens):
    tokens_clean = []
    for tok in tokens:
        if tok not in stopword_list:
            tokens_clean.append(tok)
    return tokens_clean
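A short sketch of these helpers in use (assumes the punkt tokenizer data is also downloaded):

tokens = word_tokenize("The striped bats are hanging on their feet, aren't they?")
tokens = remove_stopwords(remove_punc(tokens))
print([stemmer.stem(tok) for tok in tokens])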
Example #37
class TextSummarizer:
    #ps = PorterStemmer()
    stemmer = SnowballStemmer("english")
    stopWords = set(stopwords.words("english") + list(punctuation))
    text = ""
    sentences = ""

    def tokenize_sentence(self):
        words = word_tokenize(self.text)
        print(words)
        return words

    def input_text(self):

        while True:
            #self.text = input("Enter the text to summarize\n")
            #with open('text_input.docx','r',encoding='utf-8') as f:
            #     inp = StringIO(f.read())
            document = Document('crow.docx')
            #self.text = document.read()
            self.text = []
            for para in document.paragraphs:
                self.text.append(para.text)

            self.text = str(self.text)

            if (len(self.text) > 10):
                break
            else:
                print("Please input the text as length at least 10")

    def cal_freq(self, words):

        # Second, we create a dictionary for the word frequency table.

        freqTable = dict()
        for word in words:
            word = word.lower()
            if word in self.stopWords:
                continue
            #word = stemmer.stem(word)

            if word in freqTable:
                freqTable[word] += 1
            else:
                freqTable[word] = 1
        return freqTable

    def compute_sentence(self, freqTable):

        self.sentences = sent_tokenize(self.text)
        sentenceValue = dict()  # maps each sentence to its accumulated score

        for sentence in self.sentences:

            for index, wordValue in enumerate(freqTable, start=1):

                if wordValue in sentence.lower():  # wordValue is the word from freqTable

                    if sentence in sentenceValue:

                        sentenceValue[
                            sentence] += index  # index is the enumeration position of that word
                        #sentenceValue.update({sentence: index})
                        #print(sentenceValue)
                    else:

                        # sentenceValue[sentence] = wordValue
                        sentenceValue[sentence] = index
                        #print(sentenceValue)

        print(sentenceValue)
        return sentenceValue

    def sumAvg(self, sentenceValue):
        sumValues = 0
        for sentence in sentenceValue:

            sumValues += sentenceValue[sentence]

        # Average value of a sentence from original text
        average = int(sumValues / len(sentenceValue))

        return average

    def print_summary(self, sentenceValue, average):
        summary = ''
        for sentence in self.sentences:
            if (sentence in sentenceValue) and (sentenceValue[sentence] >
                                                (1.5 * average)):
                summary += " " + sentence

        #print(summary)

        return summary
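A hedged sketch of driving TextSummarizer end to end (assumes a crow.docx file plus the python-docx, punkt and stopwords dependencies used above):

ts = TextSummarizer()
ts.input_text()                      # loads crow.docx via python-docx
words = ts.tokenize_sentence()
freq = ts.cal_freq(words)
scores = ts.compute_sentence(freq)
print(ts.print_summary(scores, ts.sumAvg(scores)))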
        'type': dataset.pre_clean_len.dtype,
        'description': 'Length of the text before cleaning'
    },
    'dataset_shape': dataset.shape
}
pprint(data_dict)

count = 0

from nltk import PorterStemmer

count = 0
stop = stopwords.words('english')
stop.remove('not')
stop.remove('against')
mystemmer = SnowballStemmer("english")
length = len(textData)
prev = -1
done_count = 0
import time as tm
startingTime = tm.time()
print("Starting Time is", startingTime)
for i in textData:

    done_count += 1
    done_percent = (int)(100 * done_count / length)
    if done_percent != prev:
        prev = done_percent
        print("DONE = ", done_percent, "%.", sep='')

    textData[count] = cleanText(i)
Example #39
import nltk
from nltk import stem
from nltk.stem.snowball import SnowballStemmer
from tflearn.layers.core import activation
stemmer = SnowballStemmer(language='swedish')

import numpy
import tflearn
import tensorflow
import random
import json
import pickle

#Get data
with open("data/intents-mdh.json", encoding='utf-8') as file:
    data = json.load(file)

try:
    with open("data.pickle", "rb") as f:
        words, labels, training, output = pickle.load(f)
except:
    words = []
    labels = []
    docs_x = []
    docs_y = []
    for intent in data["intents"]:
        for pattern in intent["patterns"]:
            wrds = nltk.word_tokenize(pattern)
            words.extend(wrds)
            docs_x.append(wrds)
            docs_y.append(intent["tag"])
Example #40
import re, os, json, operator, functools
import numpy as np
from collections import Counter, deque

from nltk.stem.snowball import FrenchStemmer, SnowballStemmer
from nltk.tokenize import WordPunctTokenizer

# sentenceTokenizer = nltk.data.load('tokenizers/punkt/PY3/french.pickle')
stemmer = SnowballStemmer("french")
tokenizer = WordPunctTokenizer()

# stemmer = FrenchStemmer()


# Takes a string and return a list of words
def clean_text(text, stem=False):
    # Removing curly braces, those are metadata in the corpus
    text = re.sub(r'\{.*}', '', text)
    # Remove x2, x3 etc. (repeating verse annotation)
    text = re.sub(r'(x|X)\d+', '', text)
    # Replacing purely stylistics chars
    text = re.sub(r'æ', 'ae', text)
    text = re.sub(r'œ', 'oe', text)
    text = re.sub(r'[ìíîï]', 'i', text)
    text = re.sub(r'[ýÿ]', 'y', text)
    text = re.sub(r'[òóôõö]', 'o', text)
    text = re.sub(r'[áâãä]', 'a', text)
    text = re.sub(r'ë', 'e', text)
    text = re.sub(r'ñ', 'n', text)
    text = re.sub(r'[ûü]', 'u', text)
    text = re.sub(r'[«“”»]', '"', text)
Example #41
# https://www.nltk.org/api/nltk.stem.html

import nltk
from nltk.stem.snowball import SnowballStemmer
print(" ".join(SnowballStemmer.languages))
stemmer = SnowballStemmer("finnish")

word = "lumipallojakaan"
print(word)
print(stemmer.stem(word))

class TweetProcessor:
    """
    Input:
        Array of lines made from tweets in json format

    Attributes:
        data: Dataframe of tweets

    Proccedures:
        1. Removing all extra whitespaces
        2. Change the text to lowercase
        3. Remove non-alphabetical characters
        4. Remove tweet duplicates

    Output: Cleaned Dataframe of tweets
    """

    levenshtein_distance = 20
    stemmer = SnowballStemmer("english")

    def __init__(self, data):
        self.data = data

    def __remove_whitespaces(self):
        """ Removes all the tralling whitespaces """
        self.data['text'] = map(
            lambda tweet: re.sub('\s+', ' ', tweet).strip(), self.data['text'])
        print("Removed whitespaces")

    def __lowercase(self):
        """ Changes to lowercase the data """
        self.data['text'] = map(lambda tweet: tweet.lower(), self.data['text'])
        print("Lowercased")

    def __filter_alphabetic(self):
        """ Remove all non alphabetical characters """
        self.data['text'] = map(lambda tweet: tweet.encode('ascii', 'ignore'),
                                self.data['text'])
        print("Filtered alphabetic")

    def __filter_duplicates(self):
        """ Remove all duplicates from the file by applying Leveshtien Distance to the string"""
        duplicates = set()
        for i, a in enumerate(self.data['text']):
            os.system('clear')
            print("Filtered: " + str(100 * i / len(self.data['text'])) + " %")
            for j, b in enumerate(self.data['text']):
                if (i != j and lv.distance(a, b) < self.levenshtein_distance):
                    duplicates.add(j + 1)
        self.data = self.data.drop(duplicates, errors='ignore')
        self.data = self.data.reset_index(drop=True)
        print("Filtered duplicates")

    def __stem_data(self):
        """ Apply steeming to data """
        self.data['stemmed'] = self.data["text"].apply(lambda tweet: " ".join(
            [self.stemmer.stem(word) for word in tweet.split(" ")]))
        print("Stemmed")

    def process_data(self):
        self.__remove_whitespaces()
        self.__lowercase()
        self.__filter_alphabetic()
        self.__filter_duplicates()
        self.__stem_data()

        return self.data
def main():
    if len(sys.argv) < 2:
        print('error: too few arguments')
        print('command:  python create_category_corpus.py NUMBER_TOP_CATEGORY')
        quit()

    NUMBER_TOP_CATEGORY = int(sys.argv[1])
    print('NUMBER_TOP_CATEGORY=%d' % (NUMBER_TOP_CATEGORY))

    print('loading category profiles')
    profile = load_zipped_pickle('category_profiles_dbpedia_201510.gz')
    print('finish loading category profiles')

    system_flag = platform.system()
    cwd = os.getcwd()

    # initialize mongo client
    if system_flag == 'Windows':
        client = pymongo.MongoClient("localhost", 27017)
    else:
        client = pymongo.MongoClient("localhost", 58903)

    db = client.wiki2015
    wiki_article_categories = db['article_categories']

    category_corpus = {}

    pkl_filename = 'category_dbpedia_corpus_top%d_fsdm3.pkl.gz' % (
        NUMBER_TOP_CATEGORY)
    if system_flag == 'Windows':
        lucene_dbpedia_fsdm = Lucene_Object('mmapDirectory\\dbpedia_v2_FSDM3',
                                            'BM25', True)
    else:
        lucene_dbpedia_fsdm = Lucene_Object(
            '%s/mmapDirectory/dbpedia_v2_FSDM3' % (cwd), 'BM25', True)

    cnt = 0
    if os.path.exists(pkl_filename) == True:
        #if False==True:
        print('loading category corpus')
        category_corpus = load_zipped_pickle(pkl_filename)
    else:

        for item in wiki_article_categories.find():
            list_category = item['categories'].strip().split('|')
            uri_article = item['uri']
            title = findTitle(uri_article)

            entity_content_dict = {}
            doc_entity = lucene_dbpedia_fsdm.findEntityDocFromIndex(
                title, 'title', False)
            if doc_entity is None:
                continue

            for f in [
                    'names', 'attributes', 'categories', 'similar_entities',
                    'related_entities', 'catchall'
            ]:
                entity_content_dict[f] = doc_entity[f]
                entity_content_dict['stemmed_' + f] = doc_entity['stemmed_' +
                                                                 f]

            if len(entity_content_dict['catchall'].strip()) == 0:
                continue

            for cat in list_category[:NUMBER_TOP_CATEGORY]:
                if ('<http://dbpedia.org/resource/Category:' + cat +
                        '>') not in profile:
                    continue
                if cat not in category_corpus:
                    category_corpus[cat] = []
                if len(category_corpus[cat]) < 300:
                    category_corpus[cat].append(entity_content_dict)

            #cnt+=1
            #if cnt>20:
            #break

        print('saving corpus to pkl.gz')
        save_zipped_pickle(category_corpus, pkl_filename)
    client.close()

    # begin write the data into index
    print('begin write into index')
    if system_flag == 'Windows':
        LUCENE_INDEX_DIR = 'mmapDirectory\\category_corpus_dbpedia201510_top' + str(
            NUMBER_TOP_CATEGORY) + '_fsdm3'
    else:
        LUCENE_INDEX_DIR = '%s/mmapDirectory/category_corpus_dbpedia201510_top' % (
            cwd) + str(NUMBER_TOP_CATEGORY) + '_fsdm3'

    # backup code files
    cmd = 'robocopy %s %s\code_files *.py' % (
        r'%cd%', LUCENE_INDEX_DIR
    ) if system_flag == 'Windows' else 'cp *.py %s/code_files' % (
        LUCENE_INDEX_DIR)
    os.system(cmd)

    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = SimpleAnalyzer()
    config = IndexWriterConfig(analyzer)

    # write data to index
    w = IndexWriter(index_mm, config)

    cnt = 0
    data = {}
    max_article_num = 0
    stemmer = SnowballStemmer('english')
    for cat, list_entity_dict in category_corpus.items():
        cat_label = cleanSentence(cat, True)
        data.clear()
        data['category'] = (cat, 'StringField')
        data['label'] = (cat_label, 'CUSTOM_FIELD_TEXT')
        data['stemmed_label'] = (stemSentence(cat_label, stemmer,
                                              True), 'CUSTOM_FIELD_TEXT')
        data['num_articles'] = (len(list_entity_dict), 'INTEGER_STORED')

        if data['num_articles'][0] > max_article_num:
            max_article_num = data['num_articles'][0]

        for f in [
                'names', 'attributes', 'categories', 'similar_entities',
                'related_entities', 'catchall'
        ]:
            contents = cleanSentence(
                ' '.join([dic[f] for dic in list_entity_dict]), True, ' ')
            data[f] = (contents, 'CUSTOM_FIELD_TEXT_NOT_STORED')
            data['stemmed_' + f] = (stemSentence(contents, stemmer, False),
                                    'CUSTOM_FIELD_TEXT_NOT_STORED')
        #print ('--------------------')
        # need to calculate corpus average length
        addDoc(w, data)

        #cnt+=1
        #if cnt>20:
        #break

    w.close()
    print('max article num=%d' % (max_article_num))
Example #44
import telebot
import requests
from bs4 import BeautifulSoup
import re
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("russian")


def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext


@bot.message_handler(commands=['donate'])
def donate(message):
    bot.send_message(message.chat.id, "На поддержку проекта")


@bot.message_handler(commands=['start'])
def send_welcome(message):
    try:
        global k
        bot.send_message(
            message.chat.id, "Отправьте мне ссылку на RSS ленту: " + "\n\n" +
            "Список популярных RSS лент: " + "\n" + "Лента ру - /LentaRu" +
            "\n" + "СПОРТ сегодня - /SportToday" + "\n" +
            "Travel ru -  /TravelRu")
        k = 0
    except:
def cli_main():
    # parser = argparse.ArgumentParser(description=metrics_description, formatter_class=argparse.RawDescriptionHelpFormatter)
    parser = argparse.ArgumentParser(description="predictor")
    # parser.add_argument('--config-file', type=str, help='config file with metric parameters')
    # parser.add_argument('--metrics', type=str, help='comma-separated string of metrics')
    # parser.add_argument('--aggregate', type=bool, help='whether to aggregate scores')
    # parser.add_argument('--jsonl-file', type=str, help='input jsonl file to score')
    # parser.add_argument('--article-file', type=str, help='input article file')
    # parser.add_argument('--summ-file', type=str, help='input summary file')
    # parser.add_argument('--ref-file', type=str, help='input reference file')
    # parser.add_argument('--output-file', type=str, help='output file')
    # parser.add_argument('--eos', type=str, help='EOS for ROUGE (if reference not supplied as list)')
    # args = parser.parse_args()
    args = default_args(parser=parser)

    # =====================================
    # INITIALIZE METRICS
    gin.parse_config_file(args.config_file)
    toks_needed = set()
    metrics = [x.strip() for x in args.metrics.split(",")]
    metrics_dict = {}
    if "rouge" in metrics:
        from summ_eval.rouge_metric import RougeMetric
        metrics_dict["rouge"] = RougeMetric()
        toks_needed.add("line_delimited")

    if "bert_score" in metrics:
        from summ_eval.bert_score_metric import BertScoreMetric
        bert_score_metric = BertScoreMetric()
        metrics_dict["bert_score"] = bert_score_metric
        toks_needed.add("space")
    if "mover_score" in metrics:
        from summ_eval.mover_score_metric import MoverScoreMetric
        mover_score_metric = MoverScoreMetric()
        metrics_dict["mover_score"] = mover_score_metric
        toks_needed.add("space")
    if "chrf" in metrics:
        from summ_eval.chrfpp_metric import ChrfppMetric
        metrics_dict["chrf"] = ChrfppMetric()
        toks_needed.add("space")
    if "meteor" in metrics:
        from summ_eval.meteor_metric import MeteorMetric
        metrics_dict["meteor"] = MeteorMetric()
        toks_needed.add("space")
    if "bleu" in metrics:
        from summ_eval.bleu_metric import BleuMetric
        metrics_dict["bleu"] = BleuMetric()
        toks_needed.add("space")
    if "cider" in metrics:
        from summ_eval.cider_metric import CiderMetric
        metrics_dict["cider"] = CiderMetric()
        toks_needed.add("stem")

    if "s3" in metrics:
        from summ_eval.s3_metric import S3Metric
        metrics_dict["s3"] = S3Metric()
        toks_needed.add("stem")
    if "rouge_we" in metrics:
        from summ_eval.rouge_we_metric import RougeWeMetric
        metrics_dict["rouge_we"] = RougeWeMetric()
        toks_needed.add("stem")

    if "stats" in metrics:
        from summ_eval.data_stats_metric import DataStatsMetric
        metrics_dict['stats'] = DataStatsMetric()
        toks_needed.add("spacy")
    if "sms" in metrics:
        from summ_eval.sentence_movers_metric import SentenceMoversMetric
        metrics_dict['sms'] = SentenceMoversMetric()
        toks_needed.add("spacy")
    if "summaqa" in metrics:
        from summ_eval.summa_qa_metric import SummaQAMetric
        metrics_dict['summaqa'] = SummaQAMetric()
        toks_needed.add("spacy")
        toks_needed.add("space")
    if "syntactic" in metrics:
        from summ_eval.syntactic_metric import SyntacticMetric
        metrics_dict["syntactic"] = SyntacticMetric()
        toks_needed.add("space")
    if "supert" in metrics:
        from summ_eval.supert_metric import SupertMetric
        metrics_dict['supert'] = SupertMetric()
        toks_needed.add("space")
    if "blanc" in metrics:
        from summ_eval.blanc_metric import BlancMetric
        metrics_dict['blanc'] = BlancMetric()
        toks_needed.add("space")
    # =====================================

    # =====================================
    # READ INPUT
    print("Reading the input")
    ids = []
    articles = []
    references = []
    summaries = []
    bad_lines = 0
    if args.jsonl_file is not None:
        try:
            with open(args.jsonl_file) as inputf:
                for count, line in enumerate(inputf):
                    try:
                        data = json.loads(line)
                        try:
                            ids.append(data['id'])
                        except:
                            pass
                        if len(data['decoded']) == 0:
                            bad_lines += 1
                            continue
                        summaries.append(data['decoded'])
                        # references.append(data['reference'])
                        if data.get("reference", None):
                            references.append(data['reference'])
                        else:  # there are 10 additional references; the first is the original
                            references.append(data["references"][0])
                        # if "summaqa" in metrics or "stats" in metrics or "supert" in metrics or "blanc" in metrics:
                        # remove stats
                        if "summaqa" in metrics  or "supert" in metrics or "blanc" in metrics:
                            try:
                                articles.append(data['text'])
                            except:
                                raise ValueError("You specified summaqa and stats, which" \
                                                 "require input articles, but we could not parse the file!")
                    except:
                        bad_lines += 1
        except Exception as e:
            print("Input did not match required format")
            print(e)
            sys.exit()
        print(f"This many bad lines encountered during loading: {bad_lines}")

    if args.summ_file is not None:
        with open(args.summ_file) as inputf:
            summaries = inputf.read().splitlines()
    if args.ref_file is not None:
        with open(args.ref_file) as inputf:
            references = inputf.read().splitlines()
    # if "summaqa" in metrics or "stats" in metrics or "supert" in metrics or "blanc" in metrics:
    if "summaqa" in metrics  or "supert" in metrics or "blanc" in metrics:
        if args.article_file is None and len(articles) == 0:
            raise ValueError("You specified summaqa and stats, which" \
                             "require input articles, but we could not parse the file!")
        if len(articles) == 0:
            with open(args.article_file) as inputf:
                articles = inputf.read().splitlines()
    if len(ids) == 0:
        ids = list(range(0, len(summaries)))
    # =====================================

    # =====================================
    # TOKENIZATION
    print("Preparing the input")
    references_delimited = None
    summaries_delimited = None
    if len(references) > 0:
        if isinstance(references[0], list):
            if "line_delimited" in toks_needed:
                references_delimited = ["\n".join(ref) for ref in references]
            if "space" in toks_needed:
                references_space = [" ".join(ref) for ref in references]
        elif args.eos is not None:
            if "line_delimited" not in toks_needed:
                raise ValueError('You provided a delimiter but are not using a metric which requires one.')
            if args.eos == "\n":
                references_delimited = [ref.split(args.eos) for ref in references]
            else:
                references_delimited = [f"{args.eos}\n".join(ref.split(args.eos)) for ref in references]
        elif "line_delimited" in toks_needed:
            references_delimited = references
        if "space" in toks_needed:
            references_space = references

    if isinstance(summaries[0], list):
        if "line_delimited" in toks_needed:
            summaries_delimited = ["\n".join(summ) for summ in summaries]
        if "space" in toks_needed:
            summaries_space = [" ".join(summ) for summ in summaries]
    elif args.eos is not None:
        if "line_delimited" not in toks_needed:
            raise ValueError('You provided a delimiter but are not using a metric which requires one.')
        if args.eos == "\n":
            summaries_delimited = [summ.split(args.eos) for summ in summaries]
        else:
            summaries_delimited = [f"{args.eos}\n".join(summ.split(args.eos)) for summ in summaries]
    elif "line_delimited" in toks_needed:
        summaries_delimited = summaries
    if "space" in toks_needed:
        summaries_space = summaries

    if "stem" in toks_needed:
        tokenizer = RegexpTokenizer(r'\w+')
        stemmer = SnowballStemmer("english")
        if isinstance(summaries[0], list):
            summaries_stemmed = [[stemmer.stem(word) for word in tokenizer.tokenize(" ".join(summ))] for summ in
                                 summaries]
            references_stemmed = [[stemmer.stem(word) for word in tokenizer.tokenize(" ".join(ref))] for ref in
                                  references]
        else:
            summaries_stemmed = [[stemmer.stem(word) for word in tokenizer.tokenize(summ)] for summ in summaries]
            references_stemmed = [[stemmer.stem(word) for word in tokenizer.tokenize(ref)] for ref in references]
        summaries_stemmed = [" ".join(summ) for summ in summaries_stemmed]
        references_stemmed = [" ".join(ref) for ref in references_stemmed]

    if "spacy" in toks_needed:
        nlp = spacy.load('en_core_web_sm')
        # nlp = spacy.load('en_core_web_md')
        disable = ["tagger", "textcat","lemmatizer"]
        if "summaqa" not in metrics:
            disable.append("ner")
        if isinstance(summaries[0], list):
            summaries_spacy = [nlp(" ".join(text), disable=disable) for text in summaries]
        else:
            summaries_spacy = [nlp(text, disable=disable) for text in summaries]
        if "stats" in metrics:
            summaries_spacy_stats = [[tok.text for tok in summary] for summary in summaries_spacy]
        if "sms" in metrics:
            if isinstance(references[0], list):
                references_spacy = [nlp(" ".join(text), disable=disable) for text in references]
            else:
                references_spacy = [nlp(text, disable=disable) for text in references]
        # the commented block below is the original handling for summaqa and stats
        # if "summaqa" in metrics or "stats" in metrics:
        #     if isinstance(articles[0], list):
        #         input_spacy = [nlp(" ".join(text), disable=disable) for text in articles]
        #     else:
        #         input_spacy = [nlp(text, disable=disable) for text in articles]
        #     if "stats" in metrics:
        #         input_spacy_stats = [[tok.text for tok in article] for article in input_spacy]
        # use reference as article for stats
        if "summaqa" in metrics or "stats" in metrics:
            if isinstance(references[0], list):
                input_spacy = [nlp(" ".join(text), disable=disable) for text in references]
            else:
                input_spacy = [nlp(text, disable=disable) for text in references]
            if "stats" in metrics:
                input_spacy_stats = [[tok.text for tok in ref] for ref in input_spacy]
    if "supert" in metrics or "blanc" in metrics:
        inputs_space = articles
    # =====================================

    # =====================================
    # GET SCORES
    if args.aggregate:
        final_output = dict()
    else:
        final_output = defaultdict(lambda: defaultdict(int))
    # import pdb;pdb.set_trace()
    for metric, metric_cls in metrics_dict.items():
        print(f"Calculating scores for the {metric} metric.")
        try:
            if metric == "rouge":
                output = metric_cls.evaluate_batch(summaries_delimited, references_delimited, aggregate=args.aggregate)
                # only rouge uses this input so we can delete it
                del references_delimited
                del summaries_delimited
            elif metric in ('bert_score', 'mover_score', 'chrf', 'meteor', 'bleu'):
                output = metric_cls.evaluate_batch(summaries_space, references_space, aggregate=args.aggregate)
            elif metric in ('s3', 'rouge_we', 'cider'):
                output = metric_cls.evaluate_batch(summaries_stemmed, references_stemmed, aggregate=args.aggregate)
            elif metric == "sms":
                output = metric_cls.evaluate_batch(summaries_spacy, references_spacy, aggregate=args.aggregate)
            elif metric in ('summaqa', 'stats', 'supert', 'blanc'):
                if metric == "summaqa":
                    output = metric_cls.evaluate_batch(summaries_space, input_spacy, aggregate=args.aggregate)
                elif metric == "stats":
                    output = metric_cls.evaluate_batch(summaries_spacy_stats, input_spacy_stats,
                                                       aggregate=args.aggregate)
                elif metric in ('supert', 'blanc'):
                    output = metric_cls.evaluate_batch(summaries_space, inputs_space, aggregate=args.aggregate)
            if args.aggregate:
                final_output.update(output)
            else:
                ids = list(range(0, len(ids)))
                for cur_id, cur_output in zip(ids, output):
                    final_output[cur_id].update(cur_output)
        except Exception as e:
            print(e)
            print(f"An error was encountered with the {metric} metric.")
    # =====================================

    # =====================================
    # OUTPUT SCORES
    metrics_str = "_".join(metrics)
    # json_file_end = args.jsonl_file.split("/")[-1]
    json_file_end = args.jsonl_file.replace("/", "_")
    output_path = f"output_{metrics_str}.jsonl"
    print(f"saving to {output_path}")
    # with open(f"outputs/{args.output_file}_{json_file_end}_{metrics_str}.jsonl", "w") as outputf:
    with open(output_path, "w") as outputf:
        if args.aggregate:
            json.dump(final_output, outputf)
        else:
            for key, value in final_output.items():
                value["id"] = key
                json.dump(value, outputf)
                outputf.write("\n")
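# A minimal sketch (not part of the original script) of driving a single
# summ_eval metric directly, using the same BleuMetric import and
# evaluate_batch(..., aggregate=...) interface that cli_main() relies on above.
# The example summaries and references are made up.
def _bleu_sketch():
    from summ_eval.bleu_metric import BleuMetric

    summaries = ["the cat sat on the mat", "a quick brown fox"]
    references = ["the cat is sitting on the mat", "the quick brown fox jumps over the dog"]

    bleu = BleuMetric()
    # aggregate=True returns corpus-level scores; aggregate=False returns
    # one score dict per (summary, reference) pair, as in the loop above.
    print(bleu.evaluate_batch(summaries, references, aggregate=True))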
def main():
    args = parse_args()
    maxlen = args.maxlen
    maxlines = args.maxlines
    public = args.only_public
    start_offset = args.start_offset
    stemmer = SnowballStemmer("english")

    if os.path.exists(args.input + ".pickle"):
        print("loading the cached dataset...")
        with open(args.input + ".pickle", "rb") as fin:
            x, y = pickle.load(fin)
    else:
        vocabulary = {}
        samples_num = 0
        with open(args.input, errors="ignore") as fin:
            for lineno, line in enumerate(fin):
                if lineno % 1000 == 0:
                    print("line #%d" % lineno)
                if lineno > maxlines > 0:
                    break
                ctx = eval(line)
                word = False
                word_num = 0
                for c in ctx:
                    if c == ID_S:
                        word = True
                    elif word:
                        word = False
                        if public and c[0].islower() and c not in BUILTINS:
                            continue
                        word_num += 1
                        for part in extract_names(c):
                            part = stemmer.stem(part)
                            vocabulary.setdefault(part, len(vocabulary))
                samples_num += max(0, word_num - start_offset)
        print("vocabulary:", len(vocabulary), "samples:", samples_num)
        with open(args.output + ".voc", "wb") as fout:
            pickle.dump(vocabulary, fout, protocol=-1)
        x = numpy.zeros((samples_num, maxlen, len(vocabulary)),
                        dtype=numpy.float32)
        y = numpy.zeros((samples_num, len(vocabulary)),
                        dtype=numpy.float32)
        print("the worst is behind - we allocated %s bytes" %
              commaed_int(x.nbytes + y.nbytes))
        samples_num = 0
        with open(args.input, errors="ignore") as fin:
            for lineno, line in enumerate(fin):
                if lineno % 1000 == 0:
                    print("line #%d" % lineno)
                if lineno > maxlines > 0:
                    break
                ctx = eval(line)
                word = False
                words = []
                for c in ctx:
                    if c == ID_S:
                        word = True
                    elif word:
                        word = False
                        if public and c[0].islower() and c not in BUILTINS:
                            continue
                        wadd = tuple(vocabulary[stemmer.stem(p)]
                                     for p in extract_names(c))
                        if wadd:
                            words.append(wadd)
                for i in range(start_offset, len(words)):
                    for j in range(maxlen):
                        k = i - maxlen + j
                        if k >= 0:
                            for c in words[k]:
                                x[samples_num, j, c] = 1
                    for c in words[i]:
                        y[samples_num, c] = 1
                    y[samples_num] /= len(words[i])
                    samples_num += 1
        if args.cache:
            print("saving the cache...")
            try:
                with open(args.input + ".pickle", "wb") as fout:
                    pickle.dump((x, y), fout, protocol=-1)
            except Exception as e:
                print(type(e), e)
    print("x:", x.shape)
    print("y:", y.shape)
    print("shuffling...")
    if args.shuffle:
        numpy.random.seed(777)
        rng_state = numpy.random.get_state()
        numpy.random.shuffle(x)
        numpy.random.set_state(rng_state)
        numpy.random.shuffle(y)
    model = train(x, y, **args.__dict__)
    model.save(args.output, overwrite=True)
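# A small illustration (not part of the original script) of why the vocabulary
# loop above stems identifier parts before assigning indices: related surface
# forms collapse into a single vocabulary slot, keeping the one-hot tensors small.
def _vocabulary_sketch():
    from nltk.stem.snowball import SnowballStemmer

    stemmer = SnowballStemmer("english")
    vocabulary = {}
    for part in ["connect", "connected", "connection", "running", "runs"]:
        vocabulary.setdefault(stemmer.stem(part), len(vocabulary))
    print(vocabulary)  # expected: {'connect': 0, 'run': 1}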
Example #47
0
    print('The text belongs to Charles Dickens\'s "A Tale of Two Cities".')
    print(
        'The interviewee attributed their correct guess to how famous the first phrase is.'
    )
    print(
        'The first four content words were readily recognizable to anyone who has read the book.'
    )
    print('No function words were needed to identify the source.')
    print('\n')
    print('\n')

    print("_" * 70)
    print('QUESTION 3: Stemming and Lemmatization: \n')
    porter = PorterStemmer()
    lancaster = LancasterStemmer()
    snowball = SnowballStemmer('english')
    porter_stemming = [porter.stem(w) for w in filtered_words]
    lancaster_stemming = [lancaster.stem(w) for w in filtered_words]
    snowball_stemming = [snowball.stem(w) for w in filtered_words]

    #with wrapping
    format = '%s'
    pieces = [format % (word) for word in porter_stemming]
    output = ', '.join(pieces)
    wrapped_porter = fill(output)

    print('The Normalized, Filtered Text Stemmed with PorterStemmer is: \n')
    print(wrapped_porter)

    #with wrapping
    format = '%s'
Example #48
0
import re
from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize
from stop_list import closed_class_stop_words
import itertools

s = SnowballStemmer("english")

# Stripping cran.qry for the (ID, query) tuple
query_file = open('cran.qry').read()
queries = query_file.split('I')
queries.pop(0)
queries = [tuple(q.split('W')) for q in queries]
queries = [(re.findall(r'\d{3}', i), word_tokenize(s.stem(q)))
           for (i, q) in queries]
queries = [(i, [w for w in q if w not in closed_class_stop_words])
           for (i, q) in queries]
# Strip punctuation later

# Set of all unique words in queries to make dictionary for IDF score
all_queries = [q for (i, q) in queries]
all_queries = set(itertools.chain.from_iterable(all_queries))
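# A hedged sketch (not in the original snippet) of the IDF dictionary that the
# comment above alludes to: for each unique query term, count how many queries
# contain it and take log(N / document_frequency). The helper name build_idf
# is hypothetical.
import math

def build_idf(queries, vocabulary):
    n = len(queries)
    idf = {}
    for term in vocabulary:
        df = sum(1 for _, terms in queries if term in terms)
        idf[term] = math.log(n / df) if df else 0.0
    return idf

# idf_scores = build_idf(queries, all_queries)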

# Stripping cran.all.1400 for abstracts
Example #49
0
products = pd.read_csv("../input/producto_tabla.csv")
products['short_name'] = products.NombreProducto.str.extract(r'^(\D*)',
                                                             expand=False)
products['brand'] = products.NombreProducto.str.extract(r'^.+\s(\D+) \d+$',
                                                        expand=False)
w = products.NombreProducto.str.extract(r'(\d+)(Kg|g)', expand=True)
products['weight'] = w[0].astype('float') * w[1].map({'Kg': 1000, 'g': 1})
products['pieces'] = products.NombreProducto.str.extract(
    r'(\d+)p ', expand=False).astype('float')

products['short_name_processed'] = (
    products['short_name'].map(lambda x: " ".join([
        i for i in x.lower().split()
        if i not in nltk.corpus.stopwords.words("spanish")
    ])))
stemmer = SnowballStemmer("spanish")
products['short_name_processed'] = (products['short_name_processed'].map(
    lambda x: " ".join([stemmer.stem(i) for i in x.lower().split()])))

short_name_processed_list = products['short_name_processed'].unique()

vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=1000)
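
# A hedged sketch (not shown in the truncated snippet) of applying the
# vectorizer defined above to the processed short names.
bag_of_words = vectorizer.fit_transform(short_name_processed_list)
print(bag_of_words.shape)  # (n_unique_short_names, <=1000 features)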

products = pd.concat([
    products.drop('short_name', axis=1),
    pd.get_dummies(short_name_processed_list)
],
Example #50
0
                    words = ""
                    if len(content) > 1:

                        text_string = content[1].translate(
                            string.maketrans("", ""), string.punctuation)
                        # text_string =  stripNonAlphaNum(text_string)
                        # text_string =  ' '.join(text_string)
                        text_string = strip_accents(text_string)

                        word_list = text_string.split()

                        # remove only non alpha words from the string
                        word_list = [i for i in word_list if i.isalpha()]

                        # stemming
                        stemmer = SnowballStemmer("english")
                        stem_word_list = [
                            stemmer.stem(word) for word in word_list
                        ]
                        # words = (" " . join(stem_word_list))
                        # print stem_word_list

                        # # remove stopwords
                        from nltk.corpus import stopwords
                        filtered_words = [
                            word for word in stem_word_list
                            if word not in stopwords.words('english')
                        ]
                        # print filtered_words

                        # sys.exit('-random utf-8 check')
Example #51
0
def text2tokens(text, mode):

    emoticons_str = r"""
        (?:
            [:=;] # Eyes
            [oO\-]? # Nose (optional)
            [D\)\]\(\]/\\OpP] # Mouth
        )"""

    regex_str = [
        emoticons_str,
        r'<[^>]+>',  # HTML tags
        r'(?:@[\w_]+)',  # @-mentions
        r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hash-tags
        r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs

        #r'(?:\D)', # no numbers
        r"(?:[a-z][a-z\-_]+[a-z])",  # words with - and 
        r'(?:[\w_]+)',  # other words
        r'(?:\S)',  # anything else
    ]

    tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')',
                           re.VERBOSE | re.IGNORECASE)
    emoticon_re = re.compile(r'^' + emoticons_str + '$',
                             re.VERBOSE | re.IGNORECASE)
    """
    The regular expressions are compiled with the flags re.VERBOSE, to allow spaces in the regexp to be ignored (see the multi-line emoticons regexp),
    and re.IGNORECASE to catch both upper and lowercases. The tokenize() function simply catches all the tokens in a string and returns them as a list.
    This function is used within preprocess(), which is used as a pre-processing chain: in this case we simply add a lowercasing feature for all the
    tokens that are not emoticons (e.g. :D doesn’t become :d).
    """
    punctuation = list(string.punctuation)
    stop = stopwords.words('french') + punctuation + [
        '>>', '<<', '<', '>', 'via', 'le', 'les', 'a', 'rt'
    ]  # list of tokens to drop

    stemmer = SnowballStemmer('french')
    try:
        tokens = tokens_re.findall(unidecode(text))
        tokens = [
            token if emoticon_re.search(token) else token.lower()
            for token in tokens
        ]
        terms_stop = []
        for term in tokens:
            if term not in stop:
                try:
                    int(term)
                except:
                    terms_stop.append(term)
        #terms_stop = [term for term in tokens if term not in stop]  # builds a list of every term except the stop terms
        if mode == 't':
            return terms_stop
        if mode == 's':
            terms_stem = [stemmer.stem(term) for term in terms_stop]
            return terms_stem
    except:
        print("Problème dans la tokenisation du text")
        print("texte : ", text, "Type : ", type(text), "Mode : ", mode)
        pass
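# A brief usage sketch for text2tokens (not part of the original): mode 't'
# returns the filtered tokens, mode 's' returns their French Snowball stems.
# The sample tweet is made up.
sample_tweet = "Je visite les musees de Paris avec @un_ami http://example.com :)"
print(text2tokens(sample_tweet, 't'))
print(text2tokens(sample_tweet, 's'))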
# Cleaning up the text
messy_sentence = "The point of this example is to _learn how basic text cleaning works_ on *very simple* data."
tokenized_messy_sentence = nltk.word_tokenize(messy_sentence)
table = {ord(char): '' for char in string.punctuation} # in case you're interested, this is called a dict comprehension

cleaned_messy_sentence = []
for token in tokenized_messy_sentence:
    
    cleaned_word = token.translate(table) # the translate method lets us remove all unwanted characters
    cleaned_messy_sentence.append(cleaned_word)

print(cleaned_messy_sentence)

# Stemming and Lemmatization
porter = PorterStemmer()
snowball = SnowballStemmer('english')
wordnet = WordNetLemmatizer()

porterlemmas = []
wordnetlemmas = []
snowballlemmas = []

for word in tokenized:
    porterlemmas.append(porter.stem(word))
    snowballlemmas.append(snowball.stem(word))
    wordnetlemmas.append(wordnet.lemmatize(word))

print('Porter')
print(porterlemmas)
print('Snowball')
print(snowballlemmas)
tags = []
for text in stopwords:
    tag = bs(text, "lxml")
    tags.append(tag.get_text())
#tags
"""#Convest All to SmallCase"""

sm = []
for i in tags:
    sm.append(i.lower())
"""#SnowBall Stem"""

import nltk
from nltk.stem.snowball import SnowballStemmer

snow = SnowballStemmer(language='english')

stem = []
for i in sm:
    z = i.split()
    strr = ""
    for j in z:
        x = snow.stem(j)
        strr += x
        strr += " "
    stem.append(strr)
#stem
"""#task 1 Completed

---
Example #54
0
scii_funs()

    ASCII - Letter Handling

@author: Markus.Meister
"""
import unittest
import torch
import string
import unicodedata
import numpy as np
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer

# stemmers for words
stemmEN = SnowballStemmer('english')
stemmDE = SnowballStemmer('german')

stemmers = {
    'en': stemmEN,
    'de': stemmDE,
}

# tokenizer for words
tokenizer = RegexpTokenizer(r'\w+')

all_letters = string.ascii_letters + " .,;'"
all_numbers = ''.join(list(map(lambda x: str(x), range(10))))
BOUND_LOW_CHARS = 26

#%% -- functions --
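# A small usage sketch (an assumption, not from the original module): pick a
# stemmer by language key and stem a tokenized string.
def stem_tokens(text, lang='en'):
    return [stemmers[lang].stem(tok) for tok in tokenizer.tokenize(text)]

# stem_tokens("Running letters", 'en')  -> ['run', 'letter']
# stem_tokens("Laufende Buchstaben", 'de')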
Example #55
0
print(withoutStop)


# In[ ]:


bof = pd.Series(withoutStop).value_counts()
print(bof)


# In[ ]:


from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("spanish")
stemmed_spanish = [stemmer.stem(item) for item in withoutStop]
print(stemmed_spanish)


# In[ ]:


from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np



# Helper function
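# A hedged sketch (not from the truncated notebook) of the step these imports
# set up: vectorize the stemmed Spanish tokens and fit a small LDA model.
# The single-document corpus and n_components value are only illustrative.
count_vec = CountVectorizer()
dtm = count_vec.fit_transform([" ".join(stemmed_spanish)])
lda_model = LDA(n_components=2, random_state=0)
lda_model.fit(dtm)
print(lda_model.components_.shape)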
Example #56
0
import re
from nltk.stem.snowball import SnowballStemmer
import pymorphy2

from env import project_id, private_key, credentials, stops

import json
from google.cloud import bigquery
from pandas.io import gbq

import pandas as pd
morph = pymorphy2.MorphAnalyzer()

stemmer = SnowballStemmer('russian')


def search_user_library(username, q='', mode='title'):
    try:
        q = re.sub("[^а-яА-Яa-zA-Z0-9]", " ", q)
        q = q.lower()
        words = q.split()
        words = [w for w in words if not w in stops]
        words = [stemmer.stem(w) for w in words]
        if not words:
            return "некорректный ввод"  # "invalid input"

        if mode == 'author':
            Query = 'SELECT * FROM dataset.' + username + ' WHERE AUTHOR LIKE \''
            for word in words:
                Query += '%{}'.format(word)
            Query += '%\''
Example #57
0
#!/usr/bin/env python
# coding: utf-8

# In[1]:


#!/usr/bin/python

from nltk.stem.snowball import SnowballStemmer
import string
import nltk
obj_stem = SnowballStemmer('english')


# In[2]:


def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top
        (in Part 2, you will also add stemming capabilities)
        and return a string that contains all the words
        in the email (space-separated) 
        
        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
        
        """

Example #58
0
#!/bin/python
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')  # matches words, dollar amounts such as $9.99, or any other run of non-space characters
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")  # the language must be specified explicitly
# function words such as pronouns and prepositions;
# on their own they carry little sentiment
middle_words = ['and','a','the','am','it','me','with','in','on','by','near','this','that','an','there','here','those']
# stem these words and keep them in a set for fast membership tests
middle_words = set(stemmer.stem(word) for word in middle_words)

def read_files(tarfname):
    """Read the training and development data from the sentiment tar file.
        The returned object contains various fields that store sentiment data, such as:
        
        train_data,dev_data: array of documents (array of words)
        train_fnames,dev_fnames: list of filenames of the doccuments (same length as data)
        train_labels,dev_labels: the true string label for each document (same length as data)
        
        The data is also preprocessed for use with scikit-learn, as:
        
        count_vec: CountVectorizer used to process the data (for reapplication on new data)
        trainX,devX: array of vectors representing Bags of Words, i.e. documents processed through the vectorizer
        le: LabelEncoder, i.e. a mapper from string labels to ints (stored for reapplication)
        target_labels: List of labels (same order as used in le)
        trainy,devy: array of int labels, one for each document
        """
    import tarfile
    tar = tarfile.open(tarfname, "r:gz")
    trainname = "train.tsv"
    devname = "dev.tsv"
Example #59
0
def features(tokens, index, history):
    """
    `tokens`  = a POS-tagged sentence [(w1, t1), ...]
    `index`   = the index of the token we want to extract features for
    `history` = the previous predicted IOB tags
    """
 
    # init the stemmer
    stemmer = SnowballStemmer('english')
 
    # Pad the sequence with placeholders
    tokens = [('[START2]', '[START2]'), ('[START1]', '[START1]')] + list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]')]
    history = ['[START2]', '[START1]'] + list(history)
 
    # shift the index with 2, to accommodate the padding
    index += 2
 
    word, pos = tokens[index]
    prevword, prevpos = tokens[index - 1]
    prevprevword, prevprevpos = tokens[index - 2]
    nextword, nextpos = tokens[index + 1]
    nextnextword, nextnextpos = tokens[index + 2]
    previob = history[index - 1]
    contains_dash = '-' in word
    contains_dot = '.' in word
    allascii = all(c in string.ascii_lowercase for c in word.lower())
 
    allcaps = word == word.upper()
    capitalized = word[0] in string.ascii_uppercase

    prevallcaps = prevword == prevword.upper()
    prevcapitalized = prevword[0] in string.ascii_uppercase

    nextallcaps = nextword == nextword.upper()
    nextcapitalized = nextword[0] in string.ascii_uppercase
 
    return {
        'word': word,
        'lemma': stemmer.stem(word),
        'pos': pos,
        'all-ascii': allascii,
 
        'next-word': nextword,
        'next-lemma': stemmer.stem(nextword),
        'next-pos': nextpos,
 
        'next-next-word': nextnextword,
        'nextnextpos': nextnextpos,
 
        'prev-word': prevword,
        'prev-lemma': stemmer.stem(prevword),
        'prev-pos': prevpos,
 
        'prev-prev-word': prevprevword,
        'prev-prev-pos': prevprevpos,
 
        'prev-iob': previob,
 
        'contains-dash': contains_dash,
        'contains-dot': contains_dot,
 
        'all-caps': allcaps,
        'capitalized': capitalized,
 
        'prev-all-caps': prevallcaps,
        'prev-capitalized': prevcapitalized,
 
        'next-all-caps': nextallcaps,
        'next-capitalized': nextcapitalized,
    }
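# A brief usage sketch (not part of the original): extracting features for the
# first token of a POS-tagged sentence with an empty IOB history.
tagged_sentence = [('Mark', 'NNP'), ('lives', 'VBZ'), ('in', 'IN'), ('Boston', 'NNP')]
feats = features(tagged_sentence, index=0, history=[])
print(feats['word'], feats['lemma'], feats['capitalized'])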
Example #60
0
import pickle
import collections
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english', ignore_stopwords=True)


def print_key_value(l, k):
    for key, value in l[:k]:
        print("{} {}".format(key, value))


def left(l):
    r = []
    for k, v in l:
        r.append(k)
    return r


def makedict(l):
    d = dict()
    for k, v in l:
        if k not in d:
            d[k] = v
    return d


rev_stem_dict = dict()


def stem(word):