def pos_tokenizer(s):
    # define a tokenizer that uses POS tagging
    texts = nltk.word_tokenize(s)
    texts = [word for word in texts if len(word) > 2]

    # PULL OUT NOUN AND VERB PHRASES
    chunktext = nltk.pos_tag(texts)
    patterns = """
        VP:{<V.*><DT>?<JJ.*>?<NN.*>}
        NP:{<DT>?<JJ>*<NN.*>}
        N:{<NN.*>}
        """
    NPchunker = nltk.RegexpParser(patterns)

    from nltk.stem.snowball import SnowballStemmer
    st = SnowballStemmer('english')

    #print text
    temp = []
    result = NPchunker.parse(chunktext)
    #print result
    for phrase in result:
        try:
            phrase.label()
            string = ''
            m = 0
            for word in phrase:
                if m == 0:
                    string += st.stem(word[0])
                    m += 1
                else:
                    string += ' ' + st.stem(word[0])
            temp.append(string)
        except:
            pass
    return temp
def test_spanish(self):
    stemmer = SnowballStemmer('spanish')
    assert stemmer.stem("Visionado") == 'vision'
    # The word 'algue' was raising an IndexError
    assert stemmer.stem("algue") == 'algu'
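# The language tests above all construct the stemmer by name. For reference, NLTK's Snowball
# implementation exposes the accepted names as a class attribute; the exact tuple depends on
# the installed NLTK version.
from nltk.stem.snowball import SnowballStemmer

print(SnowballStemmer.languages)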
def main():
    parser = argparse.ArgumentParser(description='Evaluate translation hypotheses.')
    parser.add_argument('-i', '--input', default=baseline_path + 'data/hyp1-hyp2-ref',
                        help='input file (default data/hyp1-hyp2-ref)')
    parser.add_argument('-n', '--num_sentences', default=None, type=int,
                        help='Number of hypothesis pairs to evaluate')
    # note that if x == [1, 2, 3], then x[:None] == x[:] == x (copy); no need for sys.maxint
    opts = parser.parse_args()

    # we create a generator and avoid loading all sentences into a list
    def sentences():
        with open(opts.input) as f:
            for pair in f:
                yield [sentence.strip().split() for sentence in pair.split(' ||| ')]

    english_stemmer = SnowballStemmer("english")
    # note: the -n option does not work in the original code
    for h1, h2, ref in islice(sentences(), opts.num_sentences):
        # Perform morphological stemming before calculating METEOR score
        h1 = [english_stemmer.stem(word) for word in h1]
        h2 = [english_stemmer.stem(word) for word in h2]
        ref = [english_stemmer.stem(word) for word in ref]
        rset = set(ref)
        h1_match = meteor(h1, rset)
        # print "meteor is h1_match ", h1_match
        h2_match = meteor(h2, rset)
        # print "meteor is h2_match ", h2_match
        print(1 if h1_match > h2_match else      # \begin{cases}
              (0 if h1_match == h2_match else -1))  # \end{cases}
def classify(self, sText):
    """Given a target string sText, this function returns the most likely document
    class to which the target string belongs (i.e., positive, negative or neutral).
    """
    tokens = self.tokenize(sText)
    posProbability, negProbability = 0, 0
    posNum, negNum = float(sum(self.pos_dic.values())), float(sum(self.neg_dic.values()))
    stemmer = SnowballStemmer("english")
    for i in range(len(tokens) - 1):
        if not isPunctuationMark(tokens[i]):
            unigram = stemmer.stem(tokens[i])
            second_word = stemmer.stem(tokens[i + 1])
            try:
                bigram = unigram + " " + second_word
            except UnicodeDecodeError:
                continue
            # adds one smoothing and takes log to avoid underflow
            posProbability += math.log(float((self.pos_dic.get(bigram, 0) + 1)) / posNum)
            posProbability += math.log(float((self.pos_dic.get(unigram, 0) + 1)) / posNum)
            negProbability += math.log(float((self.neg_dic.get(bigram, 0) + 1)) / negNum)
            negProbability += math.log(float((self.neg_dic.get(unigram, 0) + 1)) / negNum)
    if tokens:
        posProbability += math.log(float((self.pos_dic.get(tokens[-1], 0) + 1)) / posNum)
        negProbability += math.log(float((self.neg_dic.get(tokens[-1], 0) + 1)) / negNum)
    if posProbability > negProbability:
        return "positive"
    else:
        return "negative"
class WordCount:
    def __init__(self, language):
        self.stopwords = self.load_stopwords(language)
        self.parse_regexp = re.compile(r"([0-9]*[\w][\w0-9]+)", re.UNICODE)
        self.current_stemmer = SnowballStemmer(language)

    @staticmethod
    def load_stopwords(language):
        stoplist = []
        if language == 'english':
            with codecs.open('geomedia' + os.sep + 'en_stoplist.txt', "r", "utf-8") as f:
                stoplist = [line.rstrip() for line in f]
        else:
            #download('stopwords')
            stoplist = stopwords.words(language)
        return stoplist

    def parse_text(self, text, wordcount_dictionary=None):
        """
        >>> wordcount = WordCount() #doctest: +ELLIPSIS
        [nltk_data] ...
        >>> wordcount.parse_text("a1a ma kota")
        {'ma': 1, 'a1a': 1, 'kota': 1}
        >>> wordcount.parse_text("a1a ma kota", {'a1a': 2, 'kota': 1})
        {'ma': 1, 'a1a': 3, 'kota': 2}
        """
        if wordcount_dictionary is None:
            wordcount_dictionary = {}
        words = self.parse_regexp.findall(text)
        for word in words:
            new_word = self.current_stemmer.stem(word.lower())
            if word not in self.stopwords and new_word not in self.stopwords:
                if new_word in wordcount_dictionary:
                    wordcount_dictionary[new_word] += 1
                else:
                    wordcount_dictionary[new_word] = 1
        return wordcount_dictionary

    def parse_text_extra(self, text, wordcount_dictionary=None, extras=None):
        if wordcount_dictionary is None:
            wordcount_dictionary = {}
        if extras is None:  # fixed: the original re-checked wordcount_dictionary here
            extras = {}
        words = self.parse_regexp.findall(text)
        for word in words:
            new_word = self.current_stemmer.stem(word.lower())
            word = word.lower()
            if word not in self.stopwords and new_word not in self.stopwords:
                if new_word in wordcount_dictionary:
                    wordcount_dictionary[new_word] += 1
                    if word in extras[new_word]:
                        extras[new_word][word] += 1
                    else:
                        extras[new_word][word] = 1
                else:
                    wordcount_dictionary[new_word] = 1
                    extras[new_word] = {}
                    extras[new_word][word] = 1
def stem_snowball(tokens):
    stemmer = SnowballStemmer("russian")
    if isinstance(tokens, basestring):
        return stemmer.stem(tokens)
    else:
        stemmed = [stemmer.stem(token) for token in tokens]
        return stemmed
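# A minimal usage sketch for the dispatch above, assuming Python 2 (the isinstance check relies
# on basestring); the concrete stems depend on the Snowball Russian rules in the installed NLTK.
print stem_snowball(u"бегущие")             # single token -> one stem
print stem_snowball([u"бегущие", u"дома"])  # list of tokens -> list of stems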
def stem(list):
    stemmer = SnowballStemmer('english')
    stemmed_tokens = []
    for x in list:
        stemmed_tokens.append(stemmer.stem(x))
        # build the dictionary mapping stemmed tokens to their original terms
        terms_dictionary.update_terms_dictionary(stemmer.stem(x), x)
    return stemmed_tokens
def test_german(self):
    stemmer_german = SnowballStemmer("german")
    stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)

    assert stemmer_german.stem("Schr\xe4nke") == 'schrank'
    assert stemmer_german2.stem("Schr\xe4nke") == 'schrank'

    assert stemmer_german.stem("keinen") == 'kein'
    assert stemmer_german2.stem("keinen") == 'keinen'
def extract_bigrams(articleList, commentCount):
    featureMatrix = np.zeros([commentCount, 100])

    index = 0
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    bagOfWords = []
    for art in articleList.items():
        for comm in art[1]:
            mywords = words(comm.body)
            mywords = known_words(mywords)
            # Remove Stops
            filtered_words = [w for w in mywords if not w in stopwords.words('english')]
            # Stemming
            stemmed_words = [stemmer.stem(w) for w in filtered_words]
            bagOfWords += stemmed_words
            bagOfWords.append("\n")

    tempVector = dict()
    # Create your bigrams
    bgs = nltk.bigrams(bagOfWords)
    fdist = nltk.FreqDist(bgs)
    for k in fdist.keys()[:100]:
        tempVector[k] = 0

    theKeys = tempVector.keys()

    for art in articleList.items():
        for comm in art[1]:
            mywords = words(comm.body)
            mywords = known_words(mywords)
            # Remove Stops
            filtered_words = [w for w in mywords if not w in stopwords.words('english')]
            # Stemming
            stemmed_words = [stemmer.stem(w) for w in filtered_words]
            bgs = nltk.bigrams(stemmed_words)
            for word in (w for w in bgs if tempVector.has_key(w)):
                keyInd = theKeys.index(word)
                featureMatrix[index][keyInd] += 1
            index += 1
            if index % 100 == 0:
                print "extracted", index, "features"
            if index >= commentCount:
                break

    print "non-zero", np.count_nonzero(featureMatrix)
    print "Percentage filled:%.2f" % (float(np.count_nonzero(featureMatrix)) / (featureMatrix.shape[0] * featureMatrix.shape[1]))
    return featureMatrix
def get_unigram_feats(document):
    document_words = set(document.split())
    s = SnowballStemmer("english")
    stemmed_words = [s.stem(word) for word in document_words]
    features = {}
    #features['count'] = len(document_words)
    for word in data.wordlist:
        word = s.stem(word)
        features['contains({})'.format(word)] = (word in stemmed_words)
    return features
def highestFrequency(quesWords, sentWords):
    stemmer = SnowballStemmer("english")
    match = 0
    nonMatch = 0
    for qw in quesWords:
        for aw in sentWords:
            if stemmer.stem(qw) == stemmer.stem(aw):
                match += 1
            else:
                nonMatch += 1
    return match
def jaccardDistance(quesWords, sentWords):
    # NOTE: identical to highestFrequency above; despite its name this returns
    # the raw number of stem matches, not a Jaccard distance.
    stemmer = SnowballStemmer("english")
    match = 0
    nonMatch = 0
    for qw in quesWords:
        for aw in sentWords:
            if stemmer.stem(qw) == stemmer.stem(aw):
                match += 1
            else:
                nonMatch += 1
    return match
def preProcessing(bitext):
    # transfer to lower case
    bitext = [[[x.lower() for x in sent] for sent in bisent] for bisent in bitext]

    # stemmers (note: as written, e_stemmer is German and f_stemmer is English,
    # and f_stemmer is applied to the f side while e_stemmer is applied to the e side)
    e_stemmer = SnowballStemmer("german")
    f_stemmer = SnowballStemmer("english")
    for (n, (f, e)) in enumerate(bitext):
        for idx, f_i in enumerate(f):
            f[idx] = f_stemmer.stem(f_i)
        for idx, e_i in enumerate(e):
            e[idx] = e_stemmer.stem(e_i)
def process_missing(missing, sec):
    st = SnowballStemmer('english')
    morphological_errors = 0
    for m in missing:
        ind = sec['incorrect'].index(m)
        prediction = sec['predicted'][ind]
        if st.stem(m[3]) == st.stem(prediction[0]):
            morphological_errors += 1
            print('the correct sequence is: ' + str(m) + ' but predicted: ' + str(prediction))
    print('morphological errors:' + str(morphological_errors))
    if len(missing):
        print('percentage:' + str(morphological_errors / len(missing)))
def trigram(self, term):
    x, y, z = term
    stemmer = SnowballStemmer("english")
    x = stemmer.stem(x)
    y = stemmer.stem(y)
    z = stemmer.stem(z)
    label = x + y + z
    new_column = []
    for words_stem in self.stemwords:
        if x in words_stem and y in words_stem and z in words_stem:
            new_column.append('True')
        else:
            new_column.append('False')
    self.dataframegenerator(new_column, label)
def imprimir_resto(clase, puesto, descrip, req):
    #lineaTotal = filter(lambda x: x in string.printable, lineaTotal)
    archEscritura.write(clase)
    archEscritura.write(",")
    blobPuesto = TextBlob(puesto.decode('utf-8'))
    blobDescrip = TextBlob(descrip.decode('utf-8'))
    blobReq = TextBlob(req.decode('utf8', 'ignore'))
    wordsPuesto = blobPuesto.words
    wordsDescrip = blobDescrip.words
    wordsReq = blobReq.words

    for wordP in wordsPuesto:
        nword = strip_accents(wordP)
        exclude = set(string.punctuation)
        nword = ''.join(ch for ch in nword if ch not in exclude)
        nword = nword.lower()
        nword = filter(lambda x: x in string.printable, nword)
        archEscritura.write(nword)
        archEscritura.write(" ")
    archEscritura.write(",")

    stemmer = SnowballStemmer("spanish")
    cad = ""
    for wordD in wordsDescrip:
        nwordD = strip_accents(wordD)
        exclude = set(string.punctuation)
        nwordD = ''.join(ch for ch in nwordD if ch not in exclude)
        nwordD = filter(lambda x: x in string.printable, nwordD)
        if nwordD not in (stopwords.words('spanish')):  # remove stop words
            w = Word(nwordD)
            #comentarios.append(w)
            word2 = stemmer.stem(w.lower())
            archEscritura.write(word2)
            archEscritura.write(" ")
    archEscritura.write(",")

    lista = []
    for wordP in wordsReq:
        nwordP = strip_accents(wordP)
        exclude = set(string.punctuation)
        nwordP = ''.join(ch for ch in nwordP if ch not in exclude)
        nwordP = filter(lambda x: x in string.printable, nwordP)
        if nwordP not in (stopwords.words('spanish')):
            w = Word(nwordP)
            word3 = stemmer.stem(w.lower())
            if word3 not in lista:
                lista.append(word3)
    for pal in lista:
        archEscritura.write(pal)
        archEscritura.write(" ")
    archEscritura.write("\n")
def clean_single_word(word, lemmatizing="wordnet"):
    """
    Performs stemming or lemmatizing on a single word.

    If we are to search for a word in a clean bag-of-words, we need to search it
    after the same kind of preprocessing.

    Inputs:  - word: A string containing the source word.
             - lemmatizing: A string containing one of the following: "porter", "snowball" or "wordnet".

    Output:  - lemma: The resulting clean lemma or stem.
    """
    if lemmatizing == "porter":
        porter = PorterStemmer()
        lemma = porter.stem(word)
    elif lemmatizing == "snowball":
        snowball = SnowballStemmer('english')
        lemma = snowball.stem(word)
    elif lemmatizing == "wordnet":
        wordnet = WordNetLemmatizer()
        lemma = wordnet.lemmatize(word)
    else:
        print("Invalid lemmatizer argument.")
        raise RuntimeError
    return lemma
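# As the docstring notes, the same backend must be used for indexing and querying.
# A hedged sketch calling the helper with each supported backend; the exact outputs
# depend on the NLTK version installed.
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

for backend in ("porter", "snowball", "wordnet"):
    print(backend, clean_single_word("running", lemmatizing=backend))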
def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top
        (in Part 2, you will also add stemming capabilities)
        and return a string that contains all the words
        in the email (space-separated)

        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
    """
    f.seek(0)  ### go back to beginning of file (annoying)
    all_text = f.read()

    ### split off metadata
    content = all_text.split("X-FileName:")
    words = ""
    if len(content) > 1:
        ### remove punctuation
        text_string = content[1].translate(string.maketrans("", ""), string.punctuation)

        ### split the text string into individual words, stem each word,
        ### and append the stemmed word to words (make sure there's a single
        ### space between each stemmed word)
        stemmer = SnowballStemmer("english")
        stemmed_words = []
        for word in text_string.split():
            stemmed_words.append(stemmer.stem(word.strip()))
        words = " ".join(stemmed_words)

    return words
class snowballStemmer:
    def __init__(self):
        self.stemmer = SnowballStemmer("english")

    def stem(self, keyword_score, keyword_idf_dir, keyword_tf_dir):
        stem_dict = {}
        stem_dict_score = {}
        print "in stem function.............."
        for key in keyword_score.iterkeys():
            root = self.stemmer.stem(key.keyword)
            if stem_dict.has_key(root):
                stem_dict[root]['words'].append(key)
                if key.is_title() or key.is_tag():
                    stem_dict[root]['boost'] = 1
                if stem_dict[root]['tf'] < keyword_tf_dir[key]:
                    stem_dict[root]['tf'] = keyword_tf_dir[key]
            else:
                stem_dict[root] = {}
                stem_dict[root]['boost'] = 0
                stem_dict[root]['words'] = []
                stem_dict[root]['words'].append(key)
                stem_dict[root]['idf'] = keyword_idf_dir[key]
                stem_dict[root]['tf'] = keyword_tf_dir[key]
                if key.is_title() or key.is_tag():
                    stem_dict[root]['boost'] = 1

        for root in stem_dict.iterkeys():
            stem_dict_score[root] = stem_dict[root]['idf'] * stem_dict[root]['tf']
        # print stem_dict_score
        return stem_dict, stem_dict_score
def tokenize(self, document):
    """
    Break text into sentences and each sentence into a list of single words
    Ignore any token that falls into the stopwords set.
    """
    # use sentence tokenizer sent_tokenize from nltk package
    sentences = sent_tokenize(utils.to_unicode(document.lower()))

    # create stemmer of class SnowballStemmer
    stemmer = SnowballStemmer("english")

    for sentence in sentences:
        words = [word for word in utils.tokenize(self.cleanse_text(sentence))]
        if self.remove_stopwords:
            words = [word for word in words if word not in self.en_stopwords]
        if self.stemming:
            words = [stemmer.stem(t) for t in words]
        yield words
def get_stemm_tags(self, tags):
    stemm_tags = []
    current_stemmer = SnowballStemmer('english')
    for tag in self.tags:
        stemm_tags.append(current_stemmer.stem(tag.lower()))
    return stemm_tags
def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top
        (in Part 2, you will also add stemming capabilities)
        and return a string that contains all the words
        in the email (space-separated)

        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
    """
    f.seek(0)  ### go back to beginning of file (annoying)
    all_text = f.read()

    ### split off metadata
    content = all_text.split("X-FileName:")
    words = ""
    stemmer = SnowballStemmer("english")
    if len(content) > 1:
        text_string = content[1].translate(string.maketrans("", ""), string.punctuation)
        split = text_string.split()
        text = [stemmer.stem(word) for word in split]
        words = ' '.join(text)
    f.close()
    return words.strip()
class Preprocessor(object):
    # initialize the processing steps
    def __init__(self):
        # create a regular-expression tokenizer
        self.tokenizer = RegexpTokenizer(r'\w+')
        # load the stopword list
        self.stop_words_english = stopwords.words('english')
        # create a Snowball stemmer
        self.stemmer = SnowballStemmer('english')

    # tokenize, remove stopwords, stem
    def process(self, input_text):
        # tokenize
        tokens = self.tokenizer.tokenize(input_text.lower())
        # remove stopwords
        tokens_stopwords = [x for x in tokens if not x in self.stop_words_english]
        # stem
        tokens_stemmed = [self.stemmer.stem(x) for x in tokens_stopwords]
        # return the processed tokens
        return tokens_stemmed
def text_cleaner_and_tokenizer(texts):
    """
    takes a list of sentences, removes punctuation, numbers, stopwords and stems.
    Then joins everything back together and returns the filtered texts as a list of unicode strings
    :param texts: list of unprocessed strings
    :return: list of unicode strings
    """
    i = 0
    stopword_list = set(stopwords.words('danish'))
    stemmer = SnowballStemmer("danish", ignore_stopwords=False)
    filtered_texts = []

    for sentence in texts:
        for symbol in punctuation:
            sentence = sentence.replace(symbol, '')
        for num in numbers:
            sentence = sentence.replace(str(num), '')
        sentence = sentence.decode('utf-8').lower()
        words_in_sentence = word_tokenize(sentence, language='danish')

        filtered_sentence = []
        for word in words_in_sentence:
            if word not in stopword_list:
                stem_word = stemmer.stem(word)
                filtered_sentence.append(stem_word)

        sentence = ' '.join(filtered_sentence)
        filtered_texts.append(sentence)

        i = i + 1
        if i % 1000 == 0:
            print(i)

    print('Done :D!')
    return filtered_texts
def tokenize(string, stem=True, entire=False):
    """
    INPUT: string
    OUTPUT: a list of words
    """
    string = string.replace("/", " ")
    string = string.replace("-", " ")

    tokenizer = PottsTokenizer(preserve_case=False)
    token_list = tokenizer.tokenize(string)

    punctuation = re.compile(r'[-.?!,":;$/*()|0-9]')  # remove these punctuations and numbers
    token_list = [punctuation.sub("", word) for word in token_list]
    token_list = filter(None, token_list)  # filters out empties

    # filter out stopwords
    STOPWORDS = set(nltk.corpus.stopwords.words('english'))
    STOPWORDS.update(('would', 'does', 'got', "doesn't", "it's", "isn't", "don't", "i'm", "i'll", "i've",
                      "=", "can't", "didn't", "etc", "+", "%", "won't", "that's", "nikon", "g", "&",
                      "sure", "may", "yet", "ok", "haven't", "else", "maybe", "wouldn't", "couldn't",
                      "via", "rt", "'", "you're", "almost", "v", "there's", "#", 'well', 'somehow',
                      'someone', 'something', 'sometime', 'sometimes', 'somewhere'))
    if entire:  # if a larger set is needed
        stopwords_entire_list = loadEntireStopWord()
        STOPWORDS.update(set(stopwords_entire_list))
    token_list = [word for word in token_list if word not in STOPWORDS]

    # stemmer
    if stem:
        stemmer = SnowballStemmer("english")
        token_stem_list = [stemmer.stem(token) for token in token_list]
        token_list = token_stem_list

    return token_list
def tokenize(s, stem=True, digit=False, stop=True, use_re=False):
    """
    :type s: str
    :type stem: bool
    :type use_re: bool
    :rtype: set(str)
    """
    stop_words = stopwords.words('english')
    stemmer = SnowballStemmer('english')
    wordnet = WordNetLemmatizer()
    table = string.maketrans("", "")

    if use_re:
        s = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', s)

    if digit:
        tokens = set(word_tokenize(unify_units(s).translate(table, string.punctuation + string.digits)))
    else:
        tokens = set(word_tokenize(unify_units(s).translate(table, string.punctuation)))

    if stop:
        tokens = set(word for word in tokens if word not in stop_words)
    if stem:
        tokens = set(stemmer.stem(word) for word in tokens)

    return tokens
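# A brief usage sketch of the flag combinations above (Python 2, because of string.maketrans
# and the two-argument translate call). unify_units belongs to the caller's code base; it is
# stubbed out here as a no-op purely for illustration.
def unify_units(text):
    return text

print tokenize("The 3 cats were running fast!")                  # stopped + stemmed
print tokenize("The 3 cats were running fast!", digit=True)      # digits stripped as well
print tokenize("CamelCaseIdentifier", use_re=True, stem=False)   # split on case changes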
def __init__(self, df, column, n):
    # gets the most frequent words in a document
    texto = " ".join(str(x) for x in df[column].values)
    tokens = texto.split()
    tokens = [x.lower() for x in tokens]
    #stopset = set(stopwords.words('english'))  # dictionary of stop words
    #tokens = [w for w in tokens if not w in stopset]
    stemmer = SnowballStemmer("english")
    stemm_words = []
    tokens_clean = []
    for j in tokens:
        sa = re.sub('[^A-Za-z]+', '', j)
        tokens_clean.append(sa)
    #print tokens_clean
    for s in tokens_clean:
        try:
            stem = stemmer.stem(s)
            if s != '':
                stemm_words.append(str(stem))
        except:
            pass
    cuenta = len(tokens_clean)
    largo = Counter(stemm_words).most_common(n)
    topdic = dict(largo)
    asortado = Series(topdic)
    asortadol = asortado.columns = ['a', 'b']
    ordenado = asortado.order(ascending=False)
    ordenadolist = topdic.keys()  # +stemm_words
    self.top = ordenadolist
def parseOutBody(f):
    from nltk.stem.snowball import SnowballStemmer
    import string

    f.seek(0)  ### go back to beginning of file (annoying)
    all_text = f.read()

    ### split off metadata
    content = all_text.split("X-FileName:")
    words = ""
    if len(content) > 1:
        ### remove punctuation
        text_string = content[1].translate(string.maketrans("", ""), string.punctuation).split()

        ### project part 2: comment out the line below
        #words = text_string

        ### split the text string into individual words, stem each word,
        ### and append the stemmed word to words (make sure there's a single
        ### space between each stemmed word)
        stemmer = SnowballStemmer('english')
        for word in text_string:
            word = word.strip()
            word = stemmer.stem(word)
            words = words + ' ' + word
    else:
        pass
    return words
def stem_stopword_clean(vett_strings):
    '''
    Takes a vector of student or job strings and returns every element of the list
    as a unique stemmed word. Splits elements made of several words and removes stopwords.
    :param vett_strings: vector of strings
    :return: vector of stemmed words without stopwords
    '''
    # import the stemming and stopword libraries
    from nltk.stem.snowball import SnowballStemmer
    from nltk.corpus import stopwords

    stemmer = SnowballStemmer("italian")
    stop = set(stopwords.words('italian'))

    # logger.error(stemmer.stem("italian"))
    # logger.error(stemmer.stem("a"))
    # logger.error(stemmer.stem("andate tutti a correre"))

    documents = []
    # logger.error(stop)
    stem_parola = ''
    for frasi in vett_strings:
        for parola in frasi.split(" "):
            stem_parola = stemmer.stem(parola)
            if stem_parola not in stop and stem_parola not in documents:
                documents.append(stem_parola)
    return documents
def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top, stem words and return a string
        that contains all the words in the email (space-separated)

        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
    """
    f.seek(0)  ### go back to beginning of file (annoying)
    all_text = f.read()

    ### split off metadata
    content = all_text.split("X-FileName:")
    words = ""
    stemmed_text_string = ""  # initialized here so the return is safe when no metadata marker is found
    if len(content) > 1:
        ### remove punctuation
        text_string = content[1].translate(string.maketrans("", ""), string.punctuation)

        ### split the text string into individual words, stemming each word,
        ### and appending the stemmed word to words
        words = text_string.strip().split()
        stemmer = SnowballStemmer("english")
        for word in words:
            stemmed_text_string += stemmer.stem(word) + " "

    return stemmed_text_string.strip()
class OntologyMatchingDatasetReader(DatasetReader):
    """
    Reads instances from a jsonlines file where each line is in the following format:

    {"match": X, "source": {kb_entity}, "target: {kb_entity}}
     X in [0, 1]
     kb_entity is a slightly modified KBEntity in json with fields:
        canonical_name
        aliases
        definition
        other_contexts
        relationships

    and converts it into a ``Dataset`` suitable for ontology matching.

    Parameters
    ----------
    token_delimiter: ``str``, optional (default=``None``)
        The text that separates each WORD-TAG pair from the next pair. If ``None``
        then the line will just be split on whitespace.
    token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``)
        We use this to define the input representation for the text. See :class:`TokenIndexer`.
        Note that the `output` tags will always correspond to single token IDs based on how they
        are pre-tokenised in the data file.
    """

    def __init__(self,
                 tokenizer: Tokenizer = None,
                 name_token_indexers: Dict[str, TokenIndexer] = None,
                 token_only_indexer: Dict[str, TokenIndexer] = None) -> None:
        self._name_token_indexers = name_token_indexers or \
            {'tokens': SingleIdTokenIndexer(namespace="tokens"),
             'token_characters': TokenCharactersIndexer(namespace="token_characters")}
        self._token_only_indexer = token_only_indexer or \
            {'tokens': SingleIdTokenIndexer(namespace="tokens")}
        self._tokenizer = tokenizer or WordTokenizer()

        self._empty_token_text_field = TextField(
            self._tokenizer.tokenize('00000'), self._token_only_indexer)
        self._empty_list_token_text_field = ListField([
            TextField(self._tokenizer.tokenize('00000'), self._token_only_indexer)
        ])

        self.PARENT_REL_LABELS = constants.UMLS_PARENT_REL_LABELS
        self.CHILD_REL_LABELS = constants.UMLS_CHILD_REL_LABELS

        self.STOP = set(stopwords.words('english'))
        self.tokenizer = RegexpTokenizer(r'[A-Za-z\d]+')
        self.stemmer = SnowballStemmer("english")
        self.lemmatizer = WordNetLemmatizer()
        self.nlp = spacy.load('en')

    @overrides
    def read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        instances = []

        # open data file and read lines
        with open(file_path, 'r') as ontm_file:
            logger.info("Reading ontology matching instances from jsonl dataset at: %s", file_path)
            for line in tqdm.tqdm(ontm_file):
                training_pair = json.loads(line)
                s_ent = training_pair['source_ent']
                t_ent = training_pair['target_ent']
                label = training_pair['label']

                # convert entry to instance and append to instances
                instances.append(self.text_to_instance(s_ent, t_ent, label))

        if not instances:
            raise ConfigurationError("No instances were read from the given filepath {}. "
                                     "Is the path correct?".format(file_path))
        return Dataset(instances)

    @staticmethod
    def _normalize_ent(ent):
        norm_ent = dict()
        norm_ent['canonical_name'] = string_utils.normalize_string(ent['canonical_name'])
        norm_ent['aliases'] = [string_utils.normalize_string(a) for a in ent['aliases']]
        norm_ent['definition'] = string_utils.normalize_string(ent['definition'])
        norm_ent['par_relations'] = set([string_utils.normalize_string(i) for i in ent['par_relations']])
        norm_ent['chd_relations'] = set([string_utils.normalize_string(i) for i in ent['chd_relations']])
        return norm_ent

    def _compute_tokens(self, ent):
        """
        Compute tokens from given entity
        :param ent:
        :return:
        """
        name_tokens = string_utils.tokenize_string(ent['canonical_name'], self.tokenizer, self.STOP)
        stemmed_tokens = tuple([self.stemmer.stem(w) for w in name_tokens])
        lemmatized_tokens = tuple([self.lemmatizer.lemmatize(w) for w in name_tokens])
        character_tokens = tuple(
            string_utils.get_character_n_grams(ent['canonical_name'], constants.NGRAM_SIZE))

        alias_tokens = [string_utils.tokenize_string(a, self.tokenizer, self.STOP) for a in ent['aliases']]
        def_tokens = string_utils.tokenize_string(ent['definition'], self.tokenizer, self.STOP)

        return [name_tokens, stemmed_tokens, lemmatized_tokens, character_tokens, alias_tokens, def_tokens]

    def _dependency_parse(self, name):
        """
        compute dependency parse of name and return root word, and all chunk root words
        :param name: name string
        :return:
        """
        doc = self.nlp(name)
        root_text = [(token.dep_, token.head.text) for token in doc]
        root = [t for d, t in root_text if d == 'ROOT'][0]
        root_words = set([t for d, t in root_text])
        return root, root_words

    def _get_features(self, s_ent, t_ent):
        """
        compute all LR model features
        :param s_ent:
        :param t_ent:
        :return:
        """
        s_name_tokens, s_stem_tokens, s_lemm_tokens, s_char_tokens, s_alias_tokens, s_def_tokens = \
            self._compute_tokens(s_ent)
        t_name_tokens, t_stem_tokens, t_lemm_tokens, t_char_tokens, t_alias_tokens, t_def_tokens = \
            self._compute_tokens(t_ent)

        has_same_canonical_name = (s_name_tokens == t_name_tokens)
        has_same_stemmed_name = (s_stem_tokens == t_stem_tokens)
        has_same_lemmatized_name = (s_lemm_tokens == t_lemm_tokens)
        has_same_char_tokens = (s_char_tokens == t_char_tokens)
        has_alias_in_common = (len(set(s_alias_tokens).intersection(set(t_alias_tokens))) > 0)

        # initialize similarity features
        name_token_jaccard_similarity = 1.0
        inverse_name_token_edit_distance = 1.0
        name_stem_jaccard_similarity = 1.0
        inverse_name_stem_edit_distance = 1.0
        name_lemm_jaccard_similarity = 1.0
        inverse_name_lemm_edit_distance = 1.0
        name_char_jaccard_similarity = 1.0
        inverse_name_char_edit_distance = 1.0

        # jaccard similarity and token edit distance
        max_changes = len(s_name_tokens) + len(t_name_tokens)
        max_char_changes = len(s_char_tokens) + len(t_char_tokens)

        if not has_same_canonical_name:
            name_token_jaccard_similarity = string_utils.get_jaccard_similarity(
                set(s_name_tokens), set(t_name_tokens))
            inverse_name_token_edit_distance = 1.0 - edit_distance(
                s_name_tokens, t_name_tokens) / max_changes
        if not has_same_stemmed_name:
            name_stem_jaccard_similarity = string_utils.get_jaccard_similarity(
                set(s_stem_tokens), set(t_stem_tokens))
            inverse_name_stem_edit_distance = 1.0 - edit_distance(
                s_stem_tokens, t_stem_tokens) / max_changes
        if not has_same_lemmatized_name:
            name_lemm_jaccard_similarity = string_utils.get_jaccard_similarity(
                set(s_lemm_tokens), set(t_lemm_tokens))
            inverse_name_lemm_edit_distance = 1.0 - edit_distance(
                s_lemm_tokens, t_lemm_tokens) / max_changes
        if not has_same_char_tokens:
            name_char_jaccard_similarity = string_utils.get_jaccard_similarity(
                set(s_char_tokens), set(t_char_tokens))
            inverse_name_char_edit_distance = 1 - edit_distance(
                s_char_tokens, t_char_tokens) / max_char_changes

        max_alias_token_jaccard = 0.0
        min_alias_edit_distance = 1.0
        best_s_alias = s_ent['aliases'][0]
        best_t_alias = t_ent['aliases'][0]

        if not has_alias_in_common:
            for s_ind, s_a_tokens in enumerate(s_alias_tokens):
                for t_ind, t_a_tokens in enumerate(t_alias_tokens):
                    if s_a_tokens and t_a_tokens:
                        j_ind = string_utils.get_jaccard_similarity(set(s_a_tokens), set(t_a_tokens))
                        if j_ind > max_alias_token_jaccard:
                            max_alias_token_jaccard = j_ind
                            best_s_alias = s_ent['aliases'][s_ind]
                            best_t_alias = t_ent['aliases'][t_ind]
                        e_dist = edit_distance(s_a_tokens, t_a_tokens) / (len(s_a_tokens) + len(t_a_tokens))
                        if e_dist < min_alias_edit_distance:
                            min_alias_edit_distance = e_dist

        # has any relationships
        has_parents = (len(s_ent['par_relations']) > 0 and len(t_ent['par_relations']) > 0)
        has_children = (len(s_ent['chd_relations']) > 0 and len(t_ent['chd_relations']) > 0)

        percent_parents_in_common = 0.0
        percent_children_in_common = 0.0

        # any relationships in common
        if has_parents:
            max_parents_in_common = (len(s_ent['par_relations']) + len(t_ent['par_relations'])) / 2
            percent_parents_in_common = len(
                s_ent['par_relations'].intersection(t_ent['par_relations'])) / max_parents_in_common
        if has_children:
            max_children_in_common = (len(s_ent['chd_relations']) + len(t_ent['chd_relations'])) / 2
            percent_children_in_common = len(
                s_ent['chd_relations'].intersection(t_ent['chd_relations'])) / max_children_in_common

        s_acronyms = [(i[0] for i in a) for a in s_alias_tokens]
        t_acronyms = [(i[0] for i in a) for a in t_alias_tokens]
        has_same_acronym = (len(set(s_acronyms).intersection(set(t_acronyms))) > 0)

        s_name_root, s_name_heads = self._dependency_parse(s_ent['canonical_name'])
        t_name_root, t_name_heads = self._dependency_parse(t_ent['canonical_name'])
        has_same_name_root_word = (s_name_root == t_name_root)
        has_same_name_chunk_heads = (s_name_heads == t_name_heads)
        name_chunk_heads_jaccard_similarity = string_utils.get_jaccard_similarity(
            s_name_heads, t_name_heads)

        s_alias_root, s_alias_heads = self._dependency_parse(best_s_alias)
        t_alias_root, t_alias_heads = self._dependency_parse(best_t_alias)
        has_same_alias_root_word = (s_alias_root == t_alias_root)
        has_same_alias_chunk_heads = (s_alias_heads == t_alias_heads)
        alias_chunk_heads_jaccard_similarity = string_utils.get_jaccard_similarity(
            s_alias_heads, t_alias_heads)

        def_jaccard_similarity = string_utils.get_jaccard_similarity(
            set(s_def_tokens), set(t_def_tokens))

        # form feature vector
        feature_vec = [
            FloatField(float(has_same_canonical_name)),
            FloatField(float(has_same_stemmed_name)),
            FloatField(float(has_same_lemmatized_name)),
            FloatField(float(has_same_char_tokens)),
            FloatField(float(has_alias_in_common)),
            FloatField(name_token_jaccard_similarity),
            FloatField(inverse_name_token_edit_distance),
            FloatField(name_stem_jaccard_similarity),
            FloatField(inverse_name_stem_edit_distance),
            FloatField(name_lemm_jaccard_similarity),
            FloatField(inverse_name_lemm_edit_distance),
            FloatField(name_char_jaccard_similarity),
            FloatField(inverse_name_char_edit_distance),
            FloatField(max_alias_token_jaccard),
            FloatField(1.0 - min_alias_edit_distance),
            FloatField(percent_parents_in_common),
            FloatField(percent_children_in_common),
            FloatField(float(has_same_acronym)),
            FloatField(float(has_same_name_root_word)),
            FloatField(float(has_same_name_chunk_heads)),
            FloatField(name_chunk_heads_jaccard_similarity),
            FloatField(float(has_same_alias_root_word)),
            FloatField(float(has_same_alias_chunk_heads)),
            FloatField(alias_chunk_heads_jaccard_similarity),
            FloatField(def_jaccard_similarity)
        ]

        return feature_vec

    @overrides
    def text_to_instance(self,  # type: ignore
                         s_ent: dict,
                         t_ent: dict,
                         label: str = None) -> Instance:
        # pylint: disable=arguments-differ

        # sample n from list l, keeping only entries with len less than max_len
        # if n is greater than the length of l, just return l
        def sample_n(l, n, max_len):
            l = [i for i in l if len(i) <= max_len]
            if not l:
                return ['00000']
            if len(l) <= n:
                return l
            return random.sample(l, n)

        fields: Dict[str, Field] = {}

        fields['sparse_features'] = ListField(
            self._get_features(self._normalize_ent(s_ent), self._normalize_ent(t_ent)))

        # tokenize names
        s_name_tokens = self._tokenizer.tokenize('00000 ' + s_ent['canonical_name'])
        t_name_tokens = self._tokenizer.tokenize('00000 ' + t_ent['canonical_name'])

        # add entity name fields
        fields['s_ent_name'] = TextField(s_name_tokens, self._name_token_indexers)
        fields['t_ent_name'] = TextField(t_name_tokens, self._name_token_indexers)

        s_aliases = sample_n(s_ent['aliases'], 16, 128)
        t_aliases = sample_n(t_ent['aliases'], 16, 128)

        # add entity alias fields
        fields['s_ent_aliases'] = ListField([
            TextField(self._tokenizer.tokenize('00000 ' + a), self._name_token_indexers)
            for a in s_aliases
        ])
        fields['t_ent_aliases'] = ListField([
            TextField(self._tokenizer.tokenize('00000 ' + a), self._name_token_indexers)
            for a in t_aliases
        ])

        # add entity definition fields
        fields['s_ent_def'] = TextField(
            self._tokenizer.tokenize(s_ent['definition']), self._token_only_indexer
        ) if s_ent['definition'] else self._empty_token_text_field
        fields['t_ent_def'] = TextField(
            self._tokenizer.tokenize(t_ent['definition']), self._token_only_indexer
        ) if t_ent['definition'] else self._empty_token_text_field

        # add entity context fields
        s_contexts = sample_n(s_ent['other_contexts'], 16, 256)
        t_contexts = sample_n(t_ent['other_contexts'], 16, 256)

        fields['s_ent_context'] = ListField([
            TextField(self._tokenizer.tokenize(c), self._token_only_indexer) for c in s_contexts
        ])
        fields['t_ent_context'] = ListField([
            TextField(self._tokenizer.tokenize(c), self._token_only_indexer) for c in t_contexts
        ])

        # add boolean label (0 = no match, 1 = match)
        fields['label'] = BooleanField(label)

        return Instance(fields)

    @classmethod
    def from_params(cls, params: Params) -> 'OntologyMatchingDatasetReader':
        tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
        name_token_indexers = TokenIndexer.dict_from_params(params.pop('name_token_indexers', {}))
        token_only_indexer = TokenIndexer.dict_from_params(params.pop('token_only_indexer', {}))
        params.assert_empty(cls.__name__)
        return OntologyMatchingDatasetReader(tokenizer=tokenizer,
                                             name_token_indexers=name_token_indexers,
                                             token_only_indexer=token_only_indexer)
# `training_data` and `stemmer` are assumed to be defined earlier in the script;
# `words` and `classes` (re-initialized here) accumulate the vocabulary and class labels.
words = []
classes = []
documents = []
ignore_words = ['?', '\'']

# loop through each sentence in the training data
for pattern in training_data:
    # tokenize each word in the sentence
    w = nltk.word_tokenize(pattern['sentence'])
    # add to words list
    words.extend(w)
    # add to documents in corpus
    documents.append((w, pattern['class']))
    # add to classes list
    if pattern['class'] not in classes:
        classes.append(pattern['class'])

# stem and lower each word and remove duplicates
words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words]
words = list(set(words))

# remove duplicates
classes = list(set(classes))

print(len(documents), "documents")
print(len(classes), "classes", classes)
print(len(words), "words", words)

# create training data
training = []
output = []
# create an empty array for output data
output_empty = [0] * len(classes)
def matrix(self, domain, independent, domainb=[('without value', 'withoutvalue')]):
    # creates a matrix M as in the cross-domain sentiment classification paper
    stemmer = SnowballStemmer("english")
    ####################################################
    domaincheck = domain
    domainl = domain
    domain1, domain2 = map(list, zip(*domainb))
    domain1 = list(map(stemmer.stem, domain1))
    domain2 = list(map(stemmer.stem, domain2))
    matrixM = DataFrame(0, index=domainl, columns=independent)
    joinf = joindocuments(df1, df2)
    undersampleddf = joinf.join(self.df1, self.df2)
    for x in undersampleddf[self.column].values:
        tokens = x.split()
        tokens = [x.lower() for x in tokens]
        stemm_words = []
        tokens_clean = []
        for j in tokens:
            sa = re.sub('[^A-Za-z]+', '', j)
            tokens_clean.append(sa)
        for s in tokens_clean:
            try:
                stem = stemmer.stem(s)
                if s != '':
                    stemm_words.append(str(stem))
            except:
                pass
        inter = set(domain).intersection(stemm_words)  # find the intersection of the two lists
        intersection1 = list(inter)
        inter1 = set(independent).intersection(stemm_words)  # find the intersection of the two lists
        intersection2 = list(inter1)
        inter3 = set(domain1).intersection(stemm_words)  # find the intersection of the two lists
        intersection3 = list(inter3)
        inter4 = set(domain2).intersection(stemm_words)  # find the intersection of the two lists
        intersection4 = list(inter4)
        if intersection1:
            if intersection2:
                for x in intersection1:
                    for y in intersection2:
                        matrixM.xs(x)[y] = matrixM.xs(x)[y] + 1
        if intersection3:
            if intersection4:
                if intersection2:
                    for x1 in intersection3:
                        for y1 in intersection4:
                            for z1 in intersection2:
                                label = x1 + y1
                                if label in domain:
                                    matrixM.xs(label)[z1] = matrixM.xs(label)[z1] + 1
    return matrixM
def stem_tokens(self, tokens):
    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer("english")
    return [stemmer.stem(t) for t in tokens]
class Data(Sent):
    def __init__(self, directory_path='', process_code=0, outpath=''):
        # Whether or not we want to take a subset of the dataframe
        self.subset = config_subset
        self.process_code = process_code
        self.outpath = outpath
        self.dataframe = pd.read_pickle(directory_path + 'all_the_news.pkl')
        # if self.subset:
        #     self.dataframe = pd.read_pickle(directory_path + 'all_the_news.pkl').sample(frac=config_subsample_size)
        #     self.labels = self.dataframe.index.tolist()
        #     self.dataframe = self.dataframe.reset_index(drop=True)
        # else:
        #     self.dataframe = pd.read_pickle(directory_path + 'all_the_news.pkl')
        #     self.labels = self.dataframe.index.tolist()
        #self.nlp = en_core_web_sm.load()
        self.stopwords = stopwords.words('english')  # {s: True for s in stopwords.words('english')}
        self.stemmer = SnowballStemmer('english')
        self.lmtzr = WordNetLemmatizer()

    # Will process all the text in spacy if we want to. Lots of different nlp options.
    # Note, rate is about 5000 articles per hour on 16gb RAM
    def __spacy__(self):
        if self.subset:
            self.dataframe = self.dataframe.sample(frac=config_subsample_size).reset_index(drop=True)
        start = time()
        self.spacy_text = {}
        for idx, row in self.dataframe.iterrows():
            self.spacy_text[idx] = self.nlp(row['content'])
            if not idx % 5000:
                print(idx, "rows in", (time() - start) / 60, 'min')

    def stem_vocab(self, w_lemma=True):
        start = time()
        if w_lemma:
            self.pdata = [[self.lmtzr.lemmatize(self.stemmer.stem(word)) for word in j] for j in self.pdata]
            print('Stemming and lemmatization done in', (time() - start) / 60, 'min')
        else:
            self.pdata = [[self.stemmer.stem(word) for word in j] for j in self.pdata]
            print('Stemming done in', (time() - start) / 60, 'min')

    # The primary function that builds the processed data
    # Once run, the data that can be output is self.pdata
    # Could also just return pdata later on, if that's a better design choice
    def get_processed_data(self, author_threshold=10, load=False):
        if load:
            # Placeholder
            pass
        else:
            article_df = self.dataframe

            # Data cleaning
            article_df = article_df[~article_df.author.isna() & ~article_df.title.isna()]
            dct = dict(Counter(article_df.author.tolist()))
            filtered_users = [key for key in dct.keys() if dct[key] > author_threshold]
            article_df = article_df[article_df.publication.isin(filtered_users)]

            if self.subset:
                article_df = article_df.sample(frac=config_subsample_size)

            article_df = article_df.set_index('id', drop=True)
            self.labels = article_df.index.tolist()
            article_df = article_df.reset_index()

            self.metadata = article_df.reset_index()[['id', 'title', 'publication', 'author',
                                                      'date', 'year', 'month']].copy().reset_index()
            self.metadata.id = self.metadata.id.fillna(-1).astype('int64')
            self.metadata.month = self.metadata.month.fillna(-1).astype('int64')
            self.metadata.year = self.metadata.year.fillna(-1).astype('int64')

            label_output = pd.DataFrame({'id': self.labels}).reset_index()
            if config_write_labels:
                label_output.to_csv(self.outpath + 'label_mapping_' + str(self.process_code) + '.csv', index=False)
                self.metadata.to_csv(self.outpath + 'metadata_by_mapping_' + str(self.process_code) + '.csv', index=False)

            self.pdata = article_df.content.tolist()

            start = time()
            self.pdata = [[i for i in re.sub(r'[^\w\s]', '', c.lower()).split() if i not in self.stopwords]
                          for c in self.pdata]
            # Numerical processing from homework assignment 4
            self.pdata = [['NUM' if re.match('[0-9]+', word) is not None else word for word in c]
                          for c in self.pdata]
            print('Simple splitting done in:', (time() - start) / 60, 'min')
def alcohol_abuse_classifier(file_name):
    tree = ET.parse(file_name)
    raw_text = tree.find('.//TEXT').text
    clean_text = re.sub('\\n', ' ', raw_text)
    clean_text = re.sub('\\t', '', clean_text)
    clean_text = re.sub('[\s]{2,}', ' ', clean_text)
    sentences = nltk.sent_tokenize(clean_text)

    hotspot_lines = set()
    # note: the original reused `i` both for the sentence loop and the window loops below,
    # which clobbered the sentence variable; renamed here to `sentence`
    for sentence in sentences:
        # filter out any non-alphabetical characters except a few special characters
        tokenizer = RegexpTokenizer(r'[a-zA-Z\/\']+')
        token = tokenizer.tokenize(sentence)
        snowball_stemmer = SnowballStemmer("english")
        stemmed_tokens = [snowball_stemmer.stem(word.lower()) for word in token]

        drink_score = 0
        abuse_score = 0
        token_count = len(stemmed_tokens)
        for j in range(token_count):
            if stemmed_tokens[j] in stemmed_alcohol:
                drink_score += 1
                # Negation detection in left direction
                for i in range(1, left_negation + 1):
                    if (j >= i) and (stemmed_tokens[j - i] in stemmed_negation):
                        drink_score = 0
                        break
                # Negation detection in right direction
                for i in range(1, right_negation + 1):
                    if (j < token_count - i) and (stemmed_tokens[j + i] in stemmed_negation):
                        drink_score = 0
                        break
                # Modifier detection in left direction
                for i in range(1, left_modifier + 1):
                    if (j >= i) and (stemmed_tokens[j - i] in stemmed_alcohol_modifer):
                        abuse_score += 1
                # Modifier detection in right direction
                for i in range(1, right_modifier + 1):
                    if (j < token_count - i) and (stemmed_tokens[j + i] in stemmed_alcohol_modifer):
                        abuse_score += 1
            # Mental Health Detection
            elif stemmed_tokens[j] in stemmed_alcohol_mental:
                abuse_score += 1

        if drink_score >= 1 and abuse_score >= 1:
            hotspot_lines.add(sentence)

    if hotspot_lines:
        return '<ALCOHOL-ABUSE met="met" />'
    else:
        return '<ALCOHOL-ABUSE met="not met" />'
def test_russian(self):
    # Russian words both consisting of Cyrillic
    # and Roman letters can be stemmed.
    stemmer_russian = SnowballStemmer("russian")
    assert stemmer_russian.stem("авантненькая") == "авантненьк"
    assert stemmer_russian.stem("avenantnen'kai^a") == "avenantnen'k"
def test_short_strings_bug(self):
    stemmer = SnowballStemmer('english')
    assert stemmer.stem("y's") == 'y'
print(c.most_common())
'''

# COLLECTING TOP 10 WORDS IN TWEETS
tweets_lst = []
punc = string.punctuation
for item in data:
    tweets = item['text']
    for i in tweets.split():
        if '@' not in i and 'http' not in i:
            # https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
            table = str.maketrans({key: None for key in string.punctuation})
            text_nopunc = i.translate(table)
            text_stem = ss.stem(text_nopunc)  # note: computed, but the unstemmed token is what gets appended
            #print(text_nopunc)
            tweets_lst.append(text_nopunc.lower())

stopwords = nltk.corpus.stopwords.words('english')

# Include unique twitter words in stopwords
#new_words = ['fortnite', 'one', 'follow', 'win', 'vbuck', 'retweet', 'enter', 'give', 'pick', 'play']
#for n in new_words:
#    stopwords.append(n)

tweets_final = []
for i in tweets_lst:
    if i not in stopwords and len(i) > 2:
        tweets_final.append(i)
class Importer(object):
    logging.basicConfig(format='%(asctime)s : %(levelname)s :: %(message)s', level=logging.DEBUG)

    def __init__(self, arg_document_count_limit=sys.maxint, arg_process_text_part=True,
                 arg_process_html_part=False, arg_process_both_empty=False):
        self.document_count_limit = arg_document_count_limit
        self.process_text_part = arg_process_text_part
        self.process_html_part = arg_process_html_part
        self.process_both_empty = arg_process_both_empty
        self.stemmer = SnowballStemmer("english")
        pass

    # http://brandonrose.org/clustering (with some modifications)
    @staticmethod
    def strip_proppers(arg_text):
        # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
        tokens = [word for sent in nltk.sent_tokenize(arg_text) for word in nltk.word_tokenize(sent)
                  if word.islower()]
        # todo get the startswiths and endswiths right here
        return "".join(
            [" " + i if not i.startswith("'") and not i.startswith("/") and not i.endswith("=")
             and i not in string.punctuation else i for i in tokens]).strip()

    # http://brandonrose.org/clustering
    def tokenize_and_stem(self, arg_text):
        # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
        tokens = [current_word for sent in nltk.sent_tokenize(arg_text)
                  for current_word in nltk.word_tokenize(sent)]
        filtered_tokens = []
        # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
        for token in tokens:
            if re.search('[a-zA-Z]', token):
                filtered_tokens.append(token)
        stems = [self.stemmer.stem(token) for token in filtered_tokens]
        return stems

    def process_folder(self, arg_folder, arg_bulk_upload, arg_document_type, arg_buffer_limit,
                       arg_server, arg_index_name, arg_kmeans_dictionary):
        document_count = 0
        document_buffer = []
        indexed_count = 0
        error_count = 0
        for root, subdirectories, files in os.walk(arg_folder):
            for current in files:
                if document_count < self.document_count_limit:
                    current_full_file_name = os.path.join(root, current)
                    # logging.debug("%d %s", document_count, current_full_file_name)
                    current_json, document_id = self.get_json(
                        current_full_file_name,
                        arg_process_text_part=self.process_text_part,
                        arg_process_html_part=self.process_html_part,
                        arg_process_both_empty=self.process_both_empty,
                        arg_kmeans_cluster_dictionary=arg_kmeans_dictionary)
                    # logging.debug(current_json)
                    document_count += 1
                    try:
                        if arg_bulk_upload:
                            wrapped = {'_type': arg_document_type, '_source': current_json}
                            document_buffer.append(wrapped)
                            if len(document_buffer) == arg_buffer_limit:
                                try:
                                    index_result = elasticsearch.helpers.bulk(arg_server, document_buffer,
                                                                              index=arg_index_name,
                                                                              request_timeout=1000)
                                    logging.debug(index_result)
                                    indexed_count += len(document_buffer)
                                    document_buffer = []
                                except elasticsearch.exceptions.ConnectionTimeout as connectionTimeout:
                                    logging.warn(connectionTimeout)
                                    document_buffer = []
                        else:
                            index_result = arg_server.index(index=arg_index_name,
                                                            doc_type=arg_document_type,
                                                            body=current_json, id=document_id)
                            indexed_count += 1
                            logging.debug("id: %s, result: %s", document_id, index_result)
                    except elasticsearch.exceptions.SerializationError as serializationError:
                        logging.warn(serializationError)
                        error_count += 1
        # need to flush the pending buffer
        if arg_bulk_upload and len(document_buffer) > 0:
            index_result = elasticsearch.helpers.bulk(arg_server, document_buffer, index=arg_index_name)
            logging.debug(index_result)

    target_encoding = 'utf-8'

    # https://groups.google.com/forum/#!topic/microsoft.public.outlookexpress.general/oig7-xNFISg
    clean_address_tokens = ['=?us-ascii?Q?', '=0D=0A_=28', '=?utf-8?Q?', '=29?=', '=0D=0A']

    def clean_address(self, argvalue):
        result = str(argvalue)
        for token in self.clean_address_tokens:
            if token in result:
                result = result.replace(token, ' ')
        return result.lower().strip()

    @staticmethod
    def get_references(current_file):
        result = {}
        with open(current_file, 'rb') as fp:
            message = pyzmail.message_from_file(fp)
            if 'Message-Id' in message.keys():
                result['message-id'] = message['Message-Id']
            elif 'Message-ID' in message.keys():
                result['message-id'] = message['Message-ID']
            else:
                logging.warn('no message id in file %s', current_file)
            if 'References' in message.keys():
                references = message['References'].split(' ')
                result['references'] = references
        return result

    def get_json(self, current_file, arg_process_text_part, arg_process_html_part,
                 arg_process_both_empty, arg_kmeans_cluster_dictionary):
        result = {'original_file': current_file}
        with open(current_file, 'rb') as fp:
            message = pyzmail.message_from_file(fp)
            # todo clean up internal whitespace
            senders = message.get_addresses('from')
            result['sender'] = [item[i] for i in [0, 1] for item in senders]
            result['short_sender'] = [item.split('@')[0] for item in result['sender']]
            clean_senders = [self.clean_address(item[1]) for item in senders]
            result['clean_sender'] = clean_senders
            # todo clean up internal whitespace
            recipients = message.get_addresses('to') + message.get_addresses('cc') + message.get_addresses('bcc')
            result['recipient'] = recipients
            result['party'] = list(
                ['{name} = {address}'.format(name=item[0], address=item[1]) for item in senders + recipients])
            result['clean_recipient'] = [self.clean_address(item[1]) for item in recipients]
            result['recipient'] = [item[i] for i in [0, 1] for item in recipients]
            result['short_recipient'] = [item.split('@')[0] for item in result['clean_recipient']]
            subject = message.get('subject')
            result['subject'] = '' if subject is None else subject.decode('iso-8859-1').encode(self.target_encoding)
            raw_date = message.get('date')
            if raw_date is not None:
                try:
                    result['date'] = dateutil.parser.parse(raw_date)
                except ValueError as valueError:
                    # todo find a way to deal with these special cases?
                    # we occasionally get a string the parser won't parse e.g.
                    # Wed, 17 Dec 2008 12:35:42 -0700 (GMT-07:00)
                    # and we need to drop off the trailing time zone and try to parse again
                    logging.warn('%s %s %s', raw_date, valueError, current_file)
                    pieces = str(raw_date).split('(')
                    result['date'] = dateutil.parser.parse(pieces[0])
            else:
                # todo add special code to handle these?
                logging.warn('no date: %s ', message)

            text_part = message.text_part
            if text_part is not None and arg_process_text_part:
                charset = text_part.charset
                payload = text_part.get_payload()
                if charset is not None:
                    try:
                        body = payload.decode(charset, 'ignore').encode(self.target_encoding)
                    except LookupError as lookupError:
                        if text_part.charset == 'iso-8859-8-i':
                            body = payload.decode('iso-8859-8', 'ignore').encode(self.target_encoding)
                        else:
                            body = payload.decode('utf-8', 'ignore').encode(self.target_encoding)
                        logging.warn('lookup error %s', lookupError)
                else:
                    body = payload.decode('utf-8', 'ignore').encode(self.target_encoding)
                result['body'] = body
                short_file_name = os.path.basename(current_file)
                result['kmeans_cluster'] = arg_kmeans_cluster_dictionary[short_file_name]
            elif message.html_part is not None and arg_process_html_part:
                payload = message.html_part.part.get_payload()
                payload_text = bs4.BeautifulSoup(payload, 'lxml').get_text().strip()
                charset = message.html_part.charset if message.html_part.charset is not None else 'utf-8'
                result['body'] = payload_text.decode(charset, 'ignore').encode(self.target_encoding)
            elif arg_process_both_empty:
                logging.warn('both text_part and html_part are None: %s', current_file)
            else:
                logging.warn('not processing %s', current_file)

            if 'body' in result.keys():
                if len(result['body']) == 0:
                    result['empty_body'] = True

            if 'Message-Id' in message.keys():
                result['message-id'] = message['Message-Id']
            if 'In-Reply-To' in message.keys():
                result['in-reply-to'] = message['In-Reply-To']
            if 'References' in message.keys():
                result['references'] = message['References'].split(' ')

        md5 = hashlib.md5()
        with open(current_file, 'rb') as fp:
            md5.update(fp.read())
        return result, md5.hexdigest()
def problem7b(debate):
    porterStemmer = PorterStemmer()
    snowballStemmer = SnowballStemmer("english", ignore_stopwords=False)
    lancasterStemmer = LancasterStemmer()
    # cachedStopWords = stopwords.words("english")
    tokenizer = RegexpTokenizer(r'\w+')
    stemDict = {'LEHRER': {}, 'OBAMA': {}, 'ROMNEY': {}}

    LEHRER = debate['LEHRER']
    OBAMA = debate['OBAMA']
    ROMNEY = debate['ROMNEY']

    LEHRER = "".join(LEHRER)
    LEHRER = tokenizer.tokenize(LEHRER)
    LEHRER = ' '.join([word.lower() for word in LEHRER if word not in stopwords.words("english")])

    pstemmed_words = ' '.join([porterStemmer.stem(word) for word in LEHRER.split(' ')])
    stemDict['LEHRER'].update({'porterStemmer': pstemmed_words})
    stemDict11 = stemDict['LEHRER']['porterStemmer']
    print("\n\n\nLEHRER:porterStemmer\n\n ", stemDict11)

    sstemmed_words = ' '.join([snowballStemmer.stem(word) for word in LEHRER.split(' ')])
    stemDict['LEHRER'].update({'snowballStemmer': sstemmed_words})
    stemDict12 = stemDict['LEHRER']['snowballStemmer']  # fixed: the original read the porterStemmer entry here
    print("\n\n\nLEHRER:snowballStemmer \n\n ", stemDict12)

    lstemmed_words = ' '.join([lancasterStemmer.stem(word) for word in LEHRER.split(' ')])
    stemDict['LEHRER'].update({'lancasterStemmer': lstemmed_words})
    stemDict13 = stemDict['LEHRER']['lancasterStemmer']
    print("\n\n\nLEHRER:lancasterStemmer \n\n ", stemDict13)

    OBAMA = "".join(OBAMA)
    OBAMA = tokenizer.tokenize(OBAMA)
    OBAMA = ' '.join([word.lower() for word in OBAMA if word not in stopwords.words("english")])

    pstemmed_words = ' '.join([porterStemmer.stem(word) for word in OBAMA.split(' ')])
    stemDict['OBAMA'].update({'porterStemmer': pstemmed_words})
    stemDict21 = stemDict['OBAMA']['porterStemmer']
    print("\n\n\nOBAMA:porterStemmer\n\n ", stemDict21)

    sstemmed_words = ' '.join([snowballStemmer.stem(word) for word in OBAMA.split(' ')])
    stemDict['OBAMA'].update({'snowballStemmer': sstemmed_words})
    stemDict22 = stemDict['OBAMA']['snowballStemmer']  # fixed as above
    print("\n\n\nOBAMA:snowballStemmer \n\n ", stemDict22)

    lstemmed_words = ' '.join([lancasterStemmer.stem(word) for word in OBAMA.split(' ')])
    stemDict['OBAMA'].update({'lancasterStemmer': lstemmed_words})
    stemDict23 = stemDict['OBAMA']['lancasterStemmer']
    print("\n\n\nOBAMA:lancasterStemmer \n\n ", stemDict23)

    ROMNEY = "".join(ROMNEY)
    ROMNEY = tokenizer.tokenize(ROMNEY)
    ROMNEY = ' '.join([word.lower() for word in ROMNEY if word not in stopwords.words("english")])

    pstemmed_words = ' '.join([porterStemmer.stem(word) for word in ROMNEY.split(' ')])
    stemDict['ROMNEY'].update({'porterStemmer': pstemmed_words})
    stemDict31 = stemDict['ROMNEY']['porterStemmer']
    print("\n\n\nROMNEY:porterStemmer\n\n ", stemDict31)

    sstemmed_words = ' '.join([snowballStemmer.stem(word) for word in ROMNEY.split(' ')])
    stemDict['ROMNEY'].update({'snowballStemmer': sstemmed_words})
    stemDict32 = stemDict['ROMNEY']['snowballStemmer']  # fixed as above
    print("\n\n\nROMNEY:snowballStemmer \n\n ", stemDict32)

    lstemmed_words = ' '.join([lancasterStemmer.stem(word) for word in ROMNEY.split(' ')])
    stemDict['ROMNEY'].update({'lancasterStemmer': lstemmed_words})
    stemDict33 = stemDict['ROMNEY']['lancasterStemmer']
    print("\n\n\nROMNEY:lancasterStemmer \n\n ", stemDict33)

    return stemDict  # {stemDict['LEHRER']['porterStemmer']: stemDict11, stemDict['OBAMA']['porterStemmer']: stemDict21, stemDict['ROMNEY']['porterStemmer']: stemDict31}
from nltk.corpus import stopwords

sw = stopwords.words("english")
print 'Amount of English stopwords: ', len(sw)

sw = stopwords.words("russian")
print 'Amount of Russian stopwords: ', len(sw)
for word in sw:
    print word

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
stemmer.stem("responsiveness")
def ball_stemming(text):
    stemmer = SnowballStemmer("english")
    tokens = word_tokenize(text)
    stemmas = [stemmer.stem(token) for token in tokens]
    return stemmas
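# A quick call of the helper above, assuming nltk's word_tokenize and SnowballStemmer are
# imported as in the snippet and the punkt tokenizer data is available; the exact stems
# depend on the installed NLTK/Snowball version.
print(ball_stemming("The runners were running quickly"))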
email_list = [string1, string2]
vectorizer.fit(email_list)
bag_of_words = vectorizer.transform(email_list)
print(bag_of_words)
print(vectorizer.vocabulary_.get('great'))

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
stemmer.stem("responsiveness")

stem_rus = SnowballStemmer("russian")
stem_rus.stem("Плотный")

in_data = "Плотный Продам свитер из Англии, фирма Woolovers, 100% хлопок. Не подошел размер. Свитер идет на 56-58 примерно размер. Плотный, не тонкий. Отдаю за свою цену, перезакажу меньший размер."
list = in_data.split(" ")
stem_rus.stem(list[2])

from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import FrenchStemmer
# doc needs to be a spacy Doc object; the enclosing `def cleaning(doc):` header is restored
# here because cleaning() is called below but the header is missing from the source excerpt
def cleaning(doc):
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['sentence'])  # It's actually a paragraph, but whatever.

t = time()
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_threads=-1)]

txt_stem = []
for row in df['sentence']:
    cleaned_sent = re.sub("[^A-Za-z']+", ' ', str(row)).lower()
    # stem each word (the original iterated over the characters of the string instead of its words)
    txt_stem.append([Snow.stem(word) for word in cleaned_sent.split()])

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

# list.extend() returns None, so extend first and then build the DataFrame
txt.extend(txt_stem)
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()

df_clean.to_sql('cleaned_TDS', conn, if_exists='replace', index=False)
conn.commit()
conn.close()
## extract sentences from paragraph
fullSents = nltk.sent_tokenize(content)

## extract words from sentences
## find parts of speech of words
wordsInSents = list()
posInSents = list()
for sent in fullSents:
    sent_token = nltk.word_tokenize(sent)  # word tokenize the sentence
    sent_pos = nltk.pos_tag(sent_token)  # tag with part of speech
    stemWords = list()  # create list to store stemmed words
    onlyPOS = list()  # create list to store sentence parts of speech
    for item in sent_pos:
        word = item[0]
        stemWords.append(p_stemmer.stem(word.lower()))  # lower case and stem words
        onlyPOS.append(item[1])
    wordsInSents.append(stemWords)
    posInSents.append(onlyPOS)

## go through each word and the pos of each word
## to see if it matches the 600 feature names of the
## finalized model to create the 600 element binary
## vector
featureNames = np.load("../Data/FeatureName_finalized_model.npy")
featureMat = np.zeros((len(wordsInSents), len(featureNames)))
for i in range(0, len(wordsInSents)):
    words = wordsInSents[i]
    POSes = posInSents[i]
    for j in range(0, len(featureNames)):
        name = featureNames[j]
    rtext = re.sub(r'[Yy]ear', '', rtext)
    return rtext

# clean the text for each case
cleaned_text = [clean(each_case) for each_case in text]
cleaned_text = [clean_number(each_case) for each_case in cleaned_text]

# create a simple tf-idf with just unigrams
# remove stopwords
stopwords = nltk.corpus.stopwords.words('english')

# tokenize each word and stem
tokenized_text = [nltk.word_tokenize(each_case) for each_case in cleaned_text]
tokenized_text = [[stemmer.stem(word) for word in each_case if word not in stopwords]
                  for each_case in tokenized_text]

tot_text = list(chain.from_iterable(tokenized_text))
fdist = FreqDist(tot_text)
wordList = list(fdist.values())  # list() so numpy gets a sequence, not a dict view
wordArray = np.array(wordList)
print('50% quantile word count of', np.percentile(wordArray, 50))
print(fdist.most_common(30))

# plot fdist on a cumulative chart
fdist.plot(30, cumulative=True)
# plot fdist on a non-cumulative chart
fdist.plot(30)
print('seldom appearing words:', fdist.hapaxes())

tfidf_text = []
predictedTest, tweetsTest, topicTest = obtainPredictedAndTweets('TweetsDownloaded/testData/test.txt')

# http://www.nltk.org/api/nltk.tokenize.html
tknzr = TweetTokenizer()
stemmer = SnowballStemmer("english")
vectorizer = TfidfVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words='english')

# stem every token of every tweet before vectorizing
tw = []
for t, statement in enumerate(tweets):
    tw.append(' '.join(stemmer.stem(i) for i in tknzr.tokenize(statement)))

twt = []
for t, statement in enumerate(tweetsTest):
    twt.append(' '.join(stemmer.stem(i) for i in tknzr.tokenize(statement)))

train = tw
sentimentTrain = predicted
test = twt
sentimentTest = predictedTest
# note: this random split overwrites the file-based train/test split assigned just above
# (sklearn.cross_validation was renamed to sklearn.model_selection in newer versions)
train, test, sentimentTrain, sentimentTest = cross_validation.train_test_split(tw, predicted, test_size=0.3, random_state=15)

train_data_features = vectorizer.fit_transform(train)
train_data_features = train_data_features.toarray()
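The snippet above only vectorizes the training split. A minimal follow-up sketch, assuming the same vectorizer, test, sentimentTrain and sentimentTest variables, would transform the held-out tweets with the already-fitted vocabulary rather than refitting, and then feed both matrices to a classifier (the LinearSVC choice here is illustrative, not from the original):

# transform (do not fit) the test split so it shares the training vocabulary
test_data_features = vectorizer.transform(test)
test_data_features = test_data_features.toarray()

from sklearn.svm import LinearSVC
clf = LinearSVC().fit(train_data_features, sentimentTrain)
print(clf.score(test_data_features, sentimentTest))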
print "We read the file contents. Size %d bytes" % (len(fileContents)) tokenizer = RegexpTokenizer(r'[a-zA-Z]+') words = tokenizer.tokenize(fileContents) # print words print "No. of words before prep : %d " % (len(words)) good_words = [] #remove stopwords and numbers for w in words: if w.lower() not in stopwords.words(): if not w.isdigit(): good_words.append(w.lower()) print "No. of words without puctuation: %d" % (len(good_words)) stemmer = SnowballStemmer("english") stemmed_words = [stemmer.stem(w) for w in good_words] print "No. of stemmed words : %d" % (len(stemmed_words)) fdist = nltk.FreqDist(stemmed_words) print "Top 50 Words" print fdist.most_common(50) #print words
def keyurFeatures(rawSentence): # TODO get a proper lexicon of intensifiers intensifiers = set([]) with open('dataset/lexicon/intensifiers.txt', 'r') as intfile: for line in intfile: for word in line.split(): intensifiers.add(word) rawSentence = nltk.word_tokenize(rawSentence) sentence = [] for token in rawSentence: match = re.search(r'^[.,?!-()";:\']+$', token) if match == None: sentence.append(token) sentence = nltk.pos_tag(sentence) stemmer = SnowballStemmer("english") features = [] lexicon = loadLexicon('dataset/lexicon/subjclueslen1-HLTEMNLP05.tff') for i in range(len(sentence)): feature = [] # the word token and part of speech feature.append(stemmer.stem(sentence[i][0])) feature.append(sentence[i][1]) # word context: before, this, after if i > 0: feature.append(sentence[i - 1][0]) else: feature.append('') feature.append(sentence[i][0]) if i < len(sentence) - 1: feature.append(sentence[i + 1][0]) else: feature.append('') try: feature.append(lexicon[sentence[i][0]]['priorpolarity']) except KeyError: feature.append('none') try: feature.append(lexicon[sentence[i][0]]['type']) except KeyError: feature.append('none') if i > 0: prevTag = sentence[i - 1][1] # preceded by adjective if prevTag[0] == 'J' and prevTag[1] == 'J': feature.append(True) else: feature.append(False) # preceded by adverb other than not if prevTag[0] == 'R' and prevTag[1] == 'B' and sentence[ i - 1][0].lower() != 'not': feature.append(True) else: feature.append(False) # preceded by intensifier if sentence[i - 1][0].lower() in intensifiers: feature.append(True) else: feature.append(False) else: feature.append(False) feature.append(False) feature.append(False) # is intensifier if sentence[i][0].lower() in intensifiers: feature.append(True) else: feature.append(False) features.append(feature) return features
#### Remove stop words
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
tokenized_words = [word for word in tokenized_words if word not in stop_words]
tokenized_words  # reduced to 27 tokens

### Stemming
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)
tokenized_words = [stemmer.stem(word) for word in tokenized_words]
tokenized_words

final_synopsis = []
for r in range(len(movie_synopsis_nomissing)):
    input_document = movie_synopsis_nomissing[r].lower()
    input_document = re.sub(r'\d+', '', input_document)   # drop digits
    input_document = strip_punctuation(input_document)
    input_words = word_tokenize(input_document)
    input_words = [word for word in input_words if word not in stop_words]
    input_words = [stemmer.stem(word) for word in input_words]
    clean_synopsis = ' '.join(map(str, input_words))
    final_synopsis.append(clean_synopsis)

from sklearn.feature_extraction.text import CountVectorizer
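A small sketch of the step the final import suggests, assuming final_synopsis holds the cleaned synopses; it just builds the term-count matrix:

vectorizer = CountVectorizer(max_features=5000)
synopsis_counts = vectorizer.fit_transform(final_synopsis)
print(synopsis_counts.shape)                    # (n_documents, n_terms)
print(vectorizer.get_feature_names_out()[:10])  # sample of the learned vocabulary (get_feature_names() on older scikit-learn)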
# In[7]: print("Cleaning dataset...") t0 = time() stopwords = load_stopwords(stopword_file) stemmer = SnowballStemmer('english', ignore_stopwords=True) stemmer.stopwords = stopwords clean_data_samples = [] for sample in data_samples: clean_sample = '' for token in re.split("'(?!(d|m|t|ll|ve)\W)|[.,\-_!?:;()0-9@=+^*`~#$%&| \t\n\>\<\"\\\/\[\]{}]+", sample.lower().decode('utf-8')): if not token or token in stopwords: continue if stemming: token = stemmer.stem(token) if len(token) < minlength: continue clean_sample = clean_sample + ' ' + token clean_data_samples.append(clean_sample) print("done in %0.3fs." % (time() - t0)) # # TF-IDF / TF Vectors # In[8]: # tf (raw term count) print("Extracting tf features...") tf_vectorizer =CountVectorizer(max_df=0.95, stop_words='english', max_features=n_features)
plt.axis('off')
plt.show()

stemmer = SnowballStemmer("english", ignore_stopwords=True)
text_1 = df_2["tweet"].to_csv()
tokenizer = RegexpTokenizer(r"\w+")
word_tokens = tokenizer.tokenize(text_1)

# lemmatize, then stem, every non-stopword token
lemmatizer = WordNetLemmatizer()
filtered_sentence = []
for w in word_tokens:
    if w not in stopwords:
        w = lemmatizer.lemmatize(w)
        w = stemmer.stem(w)
        filtered_sentence.append(w)

# splitting the string representation of the list keeps brackets/quotes attached to some tokens,
# which is why a few odd rows are dropped from the frequency table below
split_it = str(filtered_sentence).split()

from collections import Counter
word_counts = Counter(split_it)
most_occur = word_counts.most_common(60)

frequentwords = pd.DataFrame(most_occur).rename(columns={0: "words", 1: "Frequencies"}).\
    sort_values(by="Frequencies").reset_index(drop=True)
frequentwords = frequentwords.drop(index=[6, 7, 15, 26, 41, 47, 56, 57]).reset_index(drop=True)
print(frequentwords)

fig = px.bar(frequentwords, x="Frequencies", y="words", orientation="h", height=1000,
def cli_main(): #parser = argparse.ArgumentParser(description=metrics_description, formatter_class=argparse.RawDescriptionHelpFormatter) parser = argparse.ArgumentParser(description="predictor") parser.add_argument('--config-file', type=str, help='config file with metric parameters') parser.add_argument('--metrics', type=str, help='comma-separated string of metrics') parser.add_argument('--aggregate', type=bool, help='whether to aggregate scores') parser.add_argument('--jsonl-file', type=str, help='input jsonl file to score') parser.add_argument('--article-file', type=str, help='input article file') parser.add_argument('--summ-file', type=str, help='input summary file') parser.add_argument('--ref-file', type=str, help='input reference file') parser.add_argument('--output-file', type=str, help='output file') parser.add_argument( '--eos', type=str, help='EOS for ROUGE (if reference not supplied as list)') args = parser.parse_args() # ===================================== # INITIALIZE METRICS gin.parse_config_file(args.config_file) toks_needed = set() metrics = [x.strip() for x in args.metrics.split(",")] metrics_dict = {} if "rouge" in metrics: from summ_eval.rouge_metric import RougeMetric metrics_dict["rouge"] = RougeMetric() toks_needed.add("line_delimited") if "bert_score" in metrics: from summ_eval.bert_score_metric import BertScoreMetric bert_score_metric = BertScoreMetric() metrics_dict["bert_score"] = bert_score_metric toks_needed.add("space") if "mover_score" in metrics: from summ_eval.mover_score_metric import MoverScoreMetric mover_score_metric = MoverScoreMetric() metrics_dict["mover_score"] = mover_score_metric toks_needed.add("space") if "chrf" in metrics: from summ_eval.chrfpp_metric import ChrfppMetric metrics_dict["chrf"] = ChrfppMetric() toks_needed.add("space") if "meteor" in metrics: from summ_eval.meteor_metric import MeteorMetric metrics_dict["meteor"] = MeteorMetric() toks_needed.add("space") if "bleu" in metrics: from summ_eval.bleu_metric import BleuMetric metrics_dict["bleu"] = BleuMetric() toks_needed.add("space") if "cider" in metrics: from summ_eval.cider_metric import CiderMetric metrics_dict["cider"] = CiderMetric() toks_needed.add("stem") if "s3" in metrics: from summ_eval.s3_metric import S3Metric metrics_dict["s3"] = S3Metric() toks_needed.add("stem") if "rouge_we" in metrics: from summ_eval.rouge_we_metric import RougeWeMetric metrics_dict["rouge_we"] = RougeWeMetric() toks_needed.add("stem") if "stats" in metrics: from summ_eval.data_stats_metric import DataStatsMetric metrics_dict['stats'] = DataStatsMetric() toks_needed.add("spacy") if "sms" in metrics: from summ_eval.sentence_movers_metric import SentenceMoversMetric metrics_dict['sms'] = SentenceMoversMetric() toks_needed.add("spacy") if "summaqa" in metrics: from summ_eval.summa_qa_metric import SummaQAMetric metrics_dict['summaqa'] = SummaQAMetric() toks_needed.add("spacy") toks_needed.add("space") if "syntactic" in metrics: from summ_eval.syntactic_metric import SyntacticMetric metrics_dict["syntactic"] = SyntacticMetric() toks_needed.add("space") if "supert" in metrics: from summ_eval.supert_metric import SupertMetric metrics_dict['supert'] = SupertMetric() toks_needed.add("space") if "blanc" in metrics: from summ_eval.blanc_metric import BlancMetric metrics_dict['blanc'] = BlancMetric() toks_needed.add("space") # ===================================== # ===================================== # READ INPUT print("Reading the input") ids = [] articles = [] references = [] summaries = [] bad_lines = 0 
if args.jsonl_file is not None: try: with open(args.jsonl_file) as inputf: for count, line in enumerate(inputf): try: data = json.loads(line) try: ids.append(data['id']) except: pass if len(data['decoded']) == 0: bad_lines += 1 continue summaries.append(data['decoded']) references.append(data['reference']) if "summaqa" in metrics or "stats" in metrics or "supert" in metrics or "blanc" in metrics: try: articles.append(data['text']) except: raise ValueError("You specified summaqa and stats, which" \ "require input articles, but we could not parse the file!") except: bad_lines += 1 except Exception as e: print("Input did not match required format") print(e) sys.exit() print(f"This many bad lines encountered during loading: {bad_lines}") if args.summ_file is not None: with open(args.summ_file) as inputf: summaries = inputf.read().splitlines() if args.ref_file is not None: with open(args.ref_file) as inputf: references = inputf.read().splitlines() if "summaqa" in metrics or "stats" in metrics or "supert" in metrics or "blanc" in metrics: if args.article_file is None and len(articles) == 0: raise ValueError("You specified summaqa and stats, which" \ "require input articles, but we could not parse the file!") if len(articles) > 0: pass else: with open(args.article_file) as inputf: articles = inputf.read().splitlines() if len(ids) == 0: ids = list(range(0, len(summaries))) # ===================================== # ===================================== # TOKENIZATION print("Preparing the input") references_delimited = None summaries_delimited = None if len(references) > 0: if isinstance(references[0], list): if "line_delimited" in toks_needed: references_delimited = ["\n".join(ref) for ref in references] if "space" in toks_needed: references_space = [" ".join(ref) for ref in references] elif args.eos is not None: if "line_delimited" not in toks_needed: raise ValueError( 'You provided a delimiter but are not using a metric which requires one.' ) if args.eos == "\n": references_delimited = [ ref.split(args.eos) for ref in references ] else: references_delimited = [ f"{args.eos}\n".join(ref.split(args.eos)) for ref in references ] elif "line_delimited" in toks_needed: references_delimited = references if "space" in toks_needed: references_space = references if isinstance(summaries[0], list): if "line_delimited" in toks_needed: summaries_delimited = ["\n".join(summ) for summ in summaries] if "space" in toks_needed: summaries_space = [" ".join(summ) for summ in summaries] elif args.eos is not None: if "line_delimited" not in toks_needed: raise ValueError( 'You provided a delimiter but are not using a metric which requires one.' 
) if args.eos == "\n": summaries_delimited = [ref.split(args.eos) for ref in summaries] else: summaries_delimited = [ f"{args.eos}\n".join(ref.split(args.eos)) for ref in summaries ] elif "line_delimited" in toks_needed: summaries_delimited = summaries if "space" in toks_needed: summaries_space = summaries if "stem" in toks_needed: tokenizer = RegexpTokenizer(r'\w+') stemmer = SnowballStemmer("english") if isinstance(summaries[0], list): summaries_stemmed = [[ stemmer.stem(word) for word in tokenizer.tokenize(" ".join(summ)) ] for summ in summaries] references_stemmed = [[ stemmer.stem(word) for word in tokenizer.tokenize(" ".join(ref)) ] for ref in references] else: summaries_stemmed = [[ stemmer.stem(word) for word in tokenizer.tokenize(summ) ] for summ in summaries] references_stemmed = [[ stemmer.stem(word) for word in tokenizer.tokenize(ref) ] for ref in references] summaries_stemmed = [" ".join(summ) for summ in summaries_stemmed] references_stemmed = [" ".join(ref) for ref in references_stemmed] if "spacy" in toks_needed: try: nlp = spacy.load('en_core_web_md') except OSError: print( 'Downloading the spacy en_core_web_md model\n' "(don't worry, this will only happen once)", file=stderr) from spacy.cli import download download('en_core_web_md') nlp = spacy.load('en_core_web_md') disable = ["tagger", "textcat"] if "summaqa" not in metrics: disable.append("ner") if isinstance(summaries[0], list): summaries_spacy = [ nlp(" ".join(text), disable=disable) for text in summaries ] else: summaries_spacy = [ nlp(text, disable=disable) for text in summaries ] if "stats" in metrics: summaries_spacy_stats = [[tok.text for tok in summary] for summary in summaries_spacy] if "sms" in metrics: if isinstance(references[0], list): references_spacy = [ nlp(" ".join(text), disable=disable) for text in references ] else: references_spacy = [ nlp(text, disable=disable) for text in references ] if "summaqa" in metrics or "stats" in metrics: if isinstance(articles[0], list): input_spacy = [ nlp(" ".join(text), disable=disable) for text in articles ] else: input_spacy = [nlp(text, disable=disable) for text in articles] if "stats" in metrics: input_spacy_stats = [[tok.text for tok in article] for article in input_spacy] if "supert" in metrics or "blanc" in metrics: inputs_space = articles # ===================================== # ===================================== # GET SCORES if args.aggregate: final_output = dict() else: final_output = defaultdict(lambda: defaultdict(int)) #import pdb;pdb.set_trace() for metric, metric_cls in metrics_dict.items(): print(f"Calculating scores for the {metric} metric.") try: if metric == "rouge": output = metric_cls.evaluate_batch(summaries_delimited, references_delimited, aggregate=args.aggregate) # only rouge uses this input so we can delete it del references_delimited del summaries_delimited elif metric in ('bert_score', 'mover_score', 'chrf', 'meteor', 'bleu'): output = metric_cls.evaluate_batch(summaries_space, references_space, aggregate=args.aggregate) elif metric in ('s3', 'rouge_we', 'cider'): output = metric_cls.evaluate_batch(summaries_stemmed, references_stemmed, aggregate=args.aggregate) elif metric == "sms": output = metric_cls.evaluate_batch(summaries_spacy, references_spacy, aggregate=args.aggregate) elif metric in ('summaqa', 'stats', 'supert', 'blanc'): if metric == "summaqa": output = metric_cls.evaluate_batch( summaries_space, input_spacy, aggregate=args.aggregate) elif metric == "stats": output = metric_cls.evaluate_batch( summaries_spacy_stats, 
input_spacy_stats, aggregate=args.aggregate) elif metric in ('supert', 'blanc'): output = metric_cls.evaluate_batch( summaries_space, inputs_space, aggregate=args.aggregate) if args.aggregate: final_output.update(output) else: ids = list(range(0, len(ids))) for cur_id, cur_output in zip(ids, output): final_output[cur_id].update(cur_output) except Exception as e: print(e) print(f"An error was encountered with the {metric} metric.") # ===================================== # ===================================== # OUTPUT SCORES metrics_str = "_".join(metrics) #json_file_end = args.jsonl_file.split("/")[-1] json_file_end = args.jsonl_file.replace("/", "_") with open( f"outputs/{args.output_file}_{json_file_end}_{metrics_str}.jsonl", "w") as outputf: if args.aggregate: json.dump(final_output, outputf) else: for key, value in final_output.items(): value["id"] = key json.dump(value, outputf) outputf.write("\n")
def load_references(input_file, sep_doc_id=':', sep_ref_keyphrases=',', normalize_reference=False, language="en", encoding='utf-8'): """Load a reference file. Reference file can be either in json format or in the SemEval-2010 official format. Args: input_file (str): path to the reference file. sep_doc_id (str): the separator used for doc_id in reference file, defaults to ':'. sep_ref_keyphrases (str): the separator used for keyphrases in reference file, defaults to ','. normalize_reference (bool): whether to normalize the reference keyphrases using stemming, default to False. language (str): language of the input documents (used for computing the stems), defaults to 'en' (english). encoding (str): file encoding, default to utf-8. """ logging.info('loading reference keyphrases from {}'.format(input_file)) references = defaultdict(list) # open input file with codecs.open(input_file, 'r', encoding) as f: # load json data if input_file.endswith('.json'): references = json.load(f) for doc_id in references: references[doc_id] = [keyphrase for variants in references[doc_id] for keyphrase in variants] # or load SemEval-2010 file else: for line in f: cols = line.strip().split(sep_doc_id) doc_id = cols[0].strip() keyphrases = cols[1].strip().split(sep_ref_keyphrases) for v in keyphrases: if '+' in v: for s in v.split('+'): references[doc_id].append(s) else: references[doc_id].append(v) # normalize reference if needed if normalize_reference: # initialize stemmer stemmer = SnowballStemmer("porter") if language != 'en': stemmer = SnowballStemmer(ISO_to_language[language], ignore_stopwords=True) for doc_id in references: for i, keyphrase in enumerate(references[doc_id]): stems = [stemmer.stem(w) for w in keyphrase.split()] references[doc_id][i] = ' '.join(stems) return references
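A hedged usage sketch for load_references; the file layout below is inferred from the parsing code (one "doc_id : kp1,kp2,..." line per document in the SemEval-2010 style), and the file name and its contents are purely illustrative:

# hypothetical reference file refs.txt:
#   C-1 : grid computing,resource management
#   C-2 : query optimization,databases
references = load_references('refs.txt',
                             sep_doc_id=':',
                             sep_ref_keyphrases=',',
                             normalize_reference=True,
                             language='en')
print(references['C-1'])   # e.g. ['grid comput', 'resourc manag'] after stemming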
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from string import punctuation

# stopwords
nltk.download('stopwords')
stop_words = list(set(stopwords.words('english')))
punc = list(set(punctuation))
stop_words.extend(punc)
stop_words.extend(["'s", "'d", "'m"])
print(stop_words)

stemmer = SnowballStemmer('english')  # create the stemmer once, outside the loop
for x in combined:
    x = word_tokenize(x)
    x = [stemmer.stem(i).lower() for i in x]
    # drop stopwords; note they are matched against the stemmed form, so some may slip through
    x = [i for i in x if i not in stop_words]
    combined_features.append(x)

# mapping frequencies with words
from gensim import corpora
dictionary = corpora.Dictionary(combined_features)
print(dictionary)

ids = []
for x in combined_features:
    temp = [dictionary.token2id[j] for j in x]
    ids.append(temp)

# Creating MLP
"not", "h/o", "never", "none", "nor", "non", "rare", "previous", "prior", "history", "denies", "negative"] # Run the stemmer on feature words snowball_stemmer = SnowballStemmer("english") stemmed_alcohol = [snowball_stemmer.stem(word) for word in ALCOHOL] stemmed_alcohol_modifer = [snowball_stemmer.stem(word) for word in ALCOHOL_MODIFIER] stemmed_alcohol_mental = [snowball_stemmer.stem(word) for word in ALCOHOL_MENTAL] stemmed_negation = [snowball_stemmer.stem(word) for word in NEGATION] # MAIN SCRIPT # Distance to look for feature in certain direction left_negation = 5 right_negation = 3 left_modifier = 2 right_modifier = 2 # Read file def alcohol_abuse_classifier(file_name): tree=ET.parse(file_name)
def get_stem(content):
    # Stem each string in `content` in place and return the list.
    # Note: .encode('utf-8') yields bytes; drop it if downstream code expects str.
    stemmer = SnowballStemmer('english')
    for k in range(len(content)):
        content[k] = stemmer.stem(content[k]).encode('utf-8')
    return content
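A minimal usage sketch for get_stem, assuming a plain list of strings; because of the .encode call the stems come back as bytes:

docs = ["running", "flies", "happily"]
get_stem(docs)    # mutates the list in place
print(docs)       # stems encoded as UTF-8 bytes, e.g. b'run'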
def tokenize_stem(train_texts):
    # `tokenize` is an external helper defined elsewhere in this project
    tokens = tokenize(train_texts)
    stemmer = SnowballStemmer('english')
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens
def features(tokens, index, history):
    """
    `tokens`  = a POS-tagged sentence [(w1, t1), ...]
    `index`   = the index of the token we want to extract features for
    `history` = the previous predicted IOB tags
    """
    # init the stemmer
    stemmer = SnowballStemmer('english')

    # Pad the sequence with placeholders
    tokens = [('[START2]', '[START2]'), ('[START1]', '[START1]')] + list(tokens) + \
             [('[END1]', '[END1]'), ('[END2]', '[END2]')]
    history = ['[START2]', '[START1]'] + list(history)

    # shift the index by 2 to accommodate the padding
    index += 2

    word, pos = tokens[index]
    prevword, prevpos = tokens[index - 1]
    prevprevword, prevprevpos = tokens[index - 2]
    nextword, nextpos = tokens[index + 1]
    nextnextword, nextnextpos = tokens[index + 2]
    previob = history[index - 1]

    contains_dash = '-' in word
    contains_dot = '.' in word
    allascii = all(c in string.ascii_lowercase for c in word)   # every character is a lowercase ASCII letter
    allcaps = word == word.capitalize()                         # note: capitalize() checks Title-case, not strictly ALL CAPS
    capitalized = word[0] in string.ascii_uppercase

    prevallcaps = prevword == prevword.capitalize()
    prevcapitalized = prevword[0] in string.ascii_uppercase

    # capitalization features for the next word
    nextallcaps = nextword == nextword.capitalize()
    nextcapitalized = nextword[0] in string.ascii_uppercase

    return {
        'word': word,
        'lemma': stemmer.stem(word),
        'pos': pos,
        'all-ascii': allascii,

        'next-word': nextword,
        'next-lemma': stemmer.stem(nextword),
        'next-pos': nextpos,

        'next-next-word': nextnextword,
        'nextnextpos': nextnextpos,

        'prev-word': prevword,
        'prev-lemma': stemmer.stem(prevword),
        'prev-pos': prevpos,

        'prev-prev-word': prevprevword,
        'prev-prev-pos': prevprevpos,

        'prev-iob': previob,

        'contains-dash': contains_dash,
        'contains-dot': contains_dot,

        'all-caps': allcaps,
        'capitalized': capitalized,

        'prev-all-caps': prevallcaps,
        'prev-capitalized': prevcapitalized,

        'next-all-caps': nextallcaps,
        'next-capitalized': nextcapitalized,
    }
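A minimal sketch of calling features on a freshly tagged sentence (assumes NLTK's punkt and tagger data are installed; the 'O' history entries are placeholders for the previous IOB predictions, and the sentence itself is illustrative):

import nltk
tagged = nltk.pos_tag(nltk.word_tokenize("Snowball stemmers reduce words"))
feats = features(tagged, index=2, history=['O', 'O'])
print(feats['word'], feats['lemma'], feats['prev-iob'])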