from math import log10

from sklearn.feature_extraction.text import TfidfVectorizer

# Helpers defined elsewhere in this repo: preprocess, tokenize, lemmatize,
# extract_candidate_chunks, valid_ngram, get_entropy_doc, get_feature_vector.


def extract_candidates_doc(doc, phrase_list, idf_vec, training_size=450):
    # Map each known phrase to its precomputed IDF score.
    idf_dic = {}
    for i, phrase in enumerate(phrase_list):
        idf_dic[phrase] = idf_vec[i]

    print("--extracting NP")
    # Lemmatized noun-phrase chunks act as the filter for valid candidates.
    noun_phrases = set(lemmatize(phrase)
                       for phrase in extract_candidate_chunks(doc))

    # Use the vectorizer's analyzer only to enumerate 1- to 3-gram candidates.
    vectorizer = TfidfVectorizer(decode_error='ignore',
                                 preprocessor=preprocess,
                                 ngram_range=(1, 3),
                                 tokenizer=tokenize)
    analyzer = vectorizer.build_analyzer()
    phrases = list(set(phrase for phrase in analyzer(doc)
                       if valid_ngram(phrase, noun_phrases)))

    doc = preprocess(doc)
    doc_len = len(doc)
    entropy = get_entropy_doc(doc, phrases)

    # Build one feature vector per candidate phrase.
    features = []
    for i, phrase in enumerate(phrases):
        # float() avoids silent integer division (always 0) under Python 2.
        first_occurrence = doc.find(phrase) / float(doc_len)
        tf = doc.count(phrase)
        if phrase in idf_dic:
            tfidf = tf * idf_dic[phrase]
        else:
            # Phrase unseen in training: back off to the maximum possible IDF,
            # i.e. log10(training_size / 1).
            tfidf = tf * log10(training_size)
        feature_vec = get_feature_vector(phrase, tfidf, first_occurrence,
                                         entropy[i])
        features.append(feature_vec)
    return phrases, features
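# A hypothetical usage sketch for extract_candidates_doc. All inputs below
# are made-up placeholders; in the real pipeline phrase_list and idf_vec
# come from the training corpus.
def _demo_extract_candidates():
    sample_doc = 'Flooding closed the main road after the storm.'
    sample_phrases = ['flooding', 'main road', 'storm']
    sample_idf = [1.2, 2.3, 0.9]
    candidates, feature_vecs = extract_candidates_doc(
        sample_doc, sample_phrases, sample_idf, training_size=450)
    for phrase, vec in zip(candidates, feature_vecs):
        print(phrase)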
from nltk.stem import WordNetLemmatizer

def lemmatize_content(text):
    tmp = []
    lemmatizer = WordNetLemmatizer()
    for word in text.split():
        # Original had a misplaced paren, tmp.append(lemmatize(word), lemmatizer),
        # which raises TypeError; the lemmatizer belongs inside the call.
        tmp.append(lemmatize(word, lemmatizer))
    return ' '.join(tmp)
def lemmatize_content(text):
    tmp = []
    for word in text.split():
        tmp.append(lemmatize(word))
    return ' '.join(tmp)
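# A self-contained sketch of what the lemmatize() helper is assumed to do,
# using NLTK's WordNetLemmatizer directly (assumption: the project helper
# wraps something equivalent).
from nltk.stem import WordNetLemmatizer

def _lemmatize_demo(text):
    lemmatizer = WordNetLemmatizer()
    # lemmatize() defaults to noun POS, so 'cats' -> 'cat' but 'were' is left alone.
    return ' '.join(lemmatizer.lemmatize(word) for word in text.split())

# _lemmatize_demo('the cats were running')  ->  'the cat were running'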
import pandas as pd

import preprocess

filename = 'dataset_related.csv'
output = 'data_related_extracted/data_related_extracted_remstop.txt'

df = pd.read_csv(filename)
df_extract = df.loc[:, ['tweet', 'class']]
df_extract = df_extract.dropna()
df_extract = df_extract.drop_duplicates()

tweets = df_extract.tweet.values.tolist()
classes = df_extract['class'].values.tolist()

# Preprocess: normalize, strip punctuation and stopwords, lemmatize.
for i in range(len(tweets)):
    tweets[i] = tweets[i].replace('\n', ' ')
    tweets[i] = preprocess.preprocess(tweets[i])
    tweets[i] = preprocess.remove_punc(tweets[i])
    tweets[i] = preprocess.lemmatize(tweets[i])
    tweets[i] = preprocess.remove_stopwords(tweets[i])

# Write one "<tweet>\t<label>" pair per line.
s = []
for tweet, cl in zip(tweets, classes):
    s.append(tweet + '\t' + str(int(cl)) + '\n')

with open(output, 'wb') as f:
    for x in s:
        f.write(x.encode('utf-8'))
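# Sketch of reading the emitted file back (the loader name is hypothetical);
# each line is "<tweet>\t<label>" as written above.
def load_extracted(path=output):
    pairs = []
    with open(path, 'rb') as f:
        for line in f:
            tweet, label = line.decode('utf-8').rstrip('\n').split('\t')
            pairs.append((tweet, int(label)))
    return pairs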
def tokenizer(doc):
    doc = preprocess.clean_text(doc)
    return preprocess.lemmatize(doc)
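# A minimal sketch of wiring tokenizer() into scikit-learn's TfidfVectorizer
# (assumption: that is what it is written for; build_vectorizer is a
# hypothetical name). sklearn expects the tokenizer callable to return a
# token list, so if preprocess.lemmatize returns a string it must be split.
from sklearn.feature_extraction.text import TfidfVectorizer

def build_vectorizer():
    def _tokens(doc):
        # Ensure the vectorizer always receives a list of tokens.
        out = tokenizer(doc)
        return out.split() if isinstance(out, str) else out
    # lowercase=False: casing is assumed to be handled by clean_text().
    return TfidfVectorizer(tokenizer=_tokens, lowercase=False)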
from sys import argv

import yaml

import index
import preprocess
from scraper import *

# Main script
if __name__ == '__main__':
    if 'scrape' in argv:
        with open('sites.yaml', 'rb') as file:
            # safe_load avoids executing arbitrary YAML tags.
            pages = yaml.safe_load(file)
        Scraper(pages).downloadarticlesforeverywebsite()
    if 'lemmatize' in argv:
        preprocess.lemmatize()
    if 'index' in argv:
        index.index()
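# Example invocations (script filename assumed; each stage dispatches on a
# bare keyword in argv, so several can be combined in one run):
#   python main.py scrape
#   python main.py lemmatize index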