def load_all_query_annotated_robust4(
        file='/local/karmim/Stage_M1_RI/data/topics-title.annotated.csv',
        pre_process=True,
        CUSTOM_FILTERS=[lambda x: x.lower(), remove_stopwords],
        delete_meaning=True):
    query_an = {}  # Dict with words and concepts for a query id
    concept = {}   # Dict with only the concepts for a query id
    f = codecs.open(file, 'r', encoding='utf-8', errors='ignore')
    for line in f:
        line = np.array(line.split())
        index = np.where(np.char.find(line, '$#') >= 0)
        concept[line[0]] = list(line[index])
        query_an[line[0]] = list(line[1:])
        if delete_meaning:
            concept[line[0]] = [w[:-5] for w in concept[line[0]]]
            query_an[line[0]] = [
                w[:-5] if '$#' in w else w for w in query_an[line[0]]
            ]
    if pre_process:
        for k in query_an:
            query_an[k] = preprocess_string(' '.join(query_an[k]), CUSTOM_FILTERS)
            concept[k] = preprocess_string(' '.join(concept[k]), [lambda x: x.lower()])
    return query_an, concept
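# A self-contained sketch (not from the original repo) of the annotation
# convention the loader above appears to expect: concept tokens carry a '$#'
# marker followed by a sense suffix, and the w[:-5] slices imply the marker
# plus suffix is exactly five characters. The file content and the '$#n02'
# suffix below are invented for illustration.
import os
import tempfile

tmp = tempfile.NamedTemporaryFile('w', suffix='.csv', delete=False, encoding='utf-8')
tmp.write('301 international organized crime$#n02\n')
tmp.close()
query_an, concept = load_all_query_annotated_robust4(file=tmp.name, pre_process=False)
print(concept['301'])   # ['crime'] -- the five-character '$#n02' suffix is stripped
print(query_an['301'])  # ['international', 'organized', 'crime']
os.unlink(tmp.name)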
def setUp(self):
    """Set up the Lee test corpora."""
    global bg_corpus, corpus, human_sim_vector, bg_corpus2, corpus2
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
    bg_corpus_file = 'lee_background.cor'
    corpus_file = 'lee.cor'
    sim_file = 'similarities0-1.txt'

    # read in the corpora
    latin1 = partial(utils.to_unicode, encoding='latin1')
    with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f:
        bg_corpus = preprocess_documents(latin1(line) for line in f)
    with utils.smart_open(os.path.join(pre_path, corpus_file)) as f:
        corpus = preprocess_documents(latin1(line) for line in f)
    with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f:
        bg_corpus2 = [preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1]) for s in f]
    with utils.smart_open(os.path.join(pre_path, corpus_file)) as f:
        corpus2 = [preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1]) for s in f]

    # read the human similarity data
    sim_matrix = np.loadtxt(os.path.join(pre_path, sim_file))
    sim_m_size = np.shape(sim_matrix)[0]
    human_sim_vector = sim_matrix[np.triu_indices(sim_m_size, 1)]
def parse_and_load_discussion_questions(course_data_path, conn, course_zip_name):
    """Load, parse, and process discussion questions."""
    course_slug = course_zip_name.replace("_", "-")
    sql_select_discussion_question = (
        "SELECT discussion_question_id, discussion_question_title, " +
        "discussion_question_details " +
        "FROM discussion_questions, courses WHERE " +
        "discussion_questions.course_id == courses.course_id AND " +
        "courses.course_slug == (?)")
    c = conn.cursor()
    c.execute(sql_select_discussion_question, (course_slug,))
    course_questions = {}
    rows = c.fetchmany()
    while rows:
        for row in rows:
            question_id, question_title, question_details = row
            course_questions[question_id] = (
                preprocess_string(question_title) +
                preprocess_string(question_details))
        rows = c.fetchmany()

    # save the course_questions to disk
    questions_filepath = os.path.join(course_data_path, "..",
                                      "questions.{}.json".format(course_slug))
    with open(questions_filepath, "w") as questions_file:
        json.dump(course_questions, questions_file)
def main(args):
    doc_dir = args.doc_dir
    docs = defaultdict(list)
    for file in os.listdir(doc_dir):
        title, abstract = parse_xml(os.path.join(doc_dir, file))
        key = int(re.search(r'\d+', file).group())
        title = preprocess_string(title)
        abstract = preprocess_string(abstract)
        docs[key] = [title, abstract]
    train_word2vec(docs, args.model_file)
def preprocess(s, stem=True):
    '''
    Given a document or query string, return a list of preprocessed words.
    The caller decides whether or not to stem each word.
    '''
    if not stem:
        preprocess_filters = DEFAULT_FILTERS.copy()
        preprocess_filters.pop()  # remove stemming from the list of filters
        wordList = preprocess_string(s, filters=preprocess_filters)
    else:
        wordList = preprocess_string(s)
    for i in range(len(wordList)):
        wordList[i] = deaccent(wordList[i])
    return wordList
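# A minimal usage sketch for preprocess() above, assuming the same imports the
# snippet relies on (preprocess_string, DEFAULT_FILTERS, deaccent from gensim).
# With stem=False the trailing stem_text filter is popped, so word suffixes
# survive; deaccent is applied to the tokens in both cases.
print(preprocess('Cafés are flying machines', stem=True))   # stemmed tokens
print(preprocess('Cafés are flying machines', stem=False))  # unstemmed tokens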
def preprocess_column(self, pd_data, load_model=False):
    """Preprocess the specified column.

    Inputs:
        pd_data: (pd.Series) Input data to preprocess.

    Returns:
        pd_data: (pd.Series) Preprocessed data.
    """
    # preprocess using the configured set of filters
    custom_filters = self._build_custom_filter_list()
    log.info('Applying preprocess filters to %s...', pd_data.name)
    pd_data = pd_data.apply(
        lambda x: gpp.preprocess_string(x, custom_filters),
        convert_dtype=False)

    # generate phrases based on the configuration
    pd_data = self._generate_phrase(pd_data, load_model=load_model)

    # join the list of words back into a space-delimited string
    pd_data = pd_data.apply(lambda x: ' '.join(x))
    return pd_data
def _tokenize(document: dict, phraser=None):
    text_information = [value for key, value in document.items()]
    text = " ".join(text_information)

    def _custom_strip_short(s):
        return strip_short(s, minsize=2)

    def _custom_strip_numeric(s):
        RE_NUMERIC = re.compile(r' [0-9]+( [0-9]+)*(\.)? ', re.UNICODE)
        s = utils.to_unicode(s)
        return RE_NUMERIC.sub(" ", s)

    # most of the preprocessing is done already;
    # strip_tags also removes style definitions etc., which is good
    CUSTOM_FILTERS = [strip_tags, _custom_strip_short, _custom_strip_numeric]
    preprocessed_text = preprocess_string(text, CUSTOM_FILTERS)
    if phraser:
        tokens = phraser.phrase(preprocessed_text)
        return [token.replace(' ', '_') for token in tokens]
    else:
        return preprocessed_text
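# A small driver for _tokenize() above; the document dict and its field names
# are invented. strip_tags drops the markup, the custom numeric filter removes
# the space-delimited digits, and no phraser is passed.
sample = {"title": "Reading list", "body": "Rooms 10 12 have <b>large</b> windows"}
print(_tokenize(sample))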
def find_nearest_words(model, work):
    text = MAPPER.get_text(work)
    M, labels = infer_taxonomy_vectors(model, taxonomy_path=None)
    max_dists = defaultdict(lambda: 0)
    for token in pre.preprocess_string(text, filters=FILTERS):
        if token in model.wv:
            v = model.wv[token].reshape(1, -1)
            dists = cosine_similarity(M, v)
            am = np.argmax(dists)
            max_dists[labels[am]] = max(max_dists[labels[am]], np.max(dists))
    md = sorted([(k, d) for k, d in max_dists.items()], key=lambda x: x[1])
    i = 0
    rtags = []
    for k, v in md:
        if i > 10:
            break
        if 'stemning::' in k and v > 0.5:
            i += 1
            rtags.append(k)
    return rtags
def custom_tokenizer(s):
    return [
        w.translate(table)
        for w in preprocess_string(s, [
            strip_tags,
            lambda x: strip_short(x, 2),
            remove_stopwords,
            lambda x: ks.stem(x),
        ])
    ]
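# custom_tokenizer() depends on two module-level globals that are not shown:
# `table`, a str.translate table, and `ks`, a stemmer object with a .stem()
# method (possibly a Krovetz stemmer). The stand-ins below are assumptions
# made only so the function can be exercised.
table = str.maketrans('', '', "'")  # hypothetical: drop apostrophes

class _NoOpStemmer:  # hypothetical stand-in for the real stemmer
    def stem(self, s):
        return s

ks = _NoOpStemmer()
print(custom_tokenizer("<p>Don't panic, keep coding</p>"))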
def remove(self, text):
    """Preprocess the text and drop stopwords.

    Args:
        text (str): Raw input text.

    Returns:
        list of str: Lowercased, filtered tokens with stopwords removed.
    """
    CUSTOM_FILTERS = [
        lambda x: x.lower(),  # lowercase
        strip_multiple_whitespaces,
        strip_non_alphanum,
        strip_numeric,
        remove_stopwords,
        strip_short,
        # stem_text
    ]
    text = text.lower()
    example_sent = preprocess_string(text, CUSTOM_FILTERS)
    filtered_sentence = [
        w for w in example_sent if w not in self.get_stopwords()
    ]
    return filtered_sentence
def litsiden():
    f = pr.resource_filename('matext', 'data/litteratursiden_pidmap.json')
    mapper = LitteraturSidenMapper()
    works = []
    df = {}
    lk_frac = None
    with open(f) as fh:
        data = json.load(fh)
        lk_frac = [
            len(data['train']) + len(data['dev']),
            len(data['train']) + len(data['dev']) + len(data['corpus'])
        ]
        for k in data.keys():
            works += data[k]
    for w in works:
        t = mapper.get_text(w)
        tokens = pre.preprocess_string(t, filters=FILTERS)
        df[w] = [len(tokens), len(t)]
    df = pd.DataFrame.from_dict(df, orient='index')
    df.columns = ['tokens', 'char-length']
    print(df.describe())
    fig, ax = plt.subplots()
    sns.distplot(df['char-length'], ax=ax, bins=100, kde=False, norm_hist=False)
    fig.savefig('lit_hist.png')
    print('LK', lk_frac)
def preprocess(datafolder):
    docs = {}
    nlp = load('en')
    for file in os.listdir(datafolder):  # go through all the files in the folder
        filepath = os.path.join(datafolder, file)
        if not file.startswith('.'):
            document = loadDoc(filepath)
            sentenceSplit = list(nlp(document).sents)
            gensimSettings = [
                lambda x: x.lower(),
                genPreProc.remove_stopwords,
                genPreProc.stem,  # make the text uniform and remove stopwords
                genPreProc.strip_non_alphanum,
                genPreProc.strip_multiple_whitespaces,
            ]
            sentencePreprocess = [
                ' '.join(preprocess_string(str(sentence), filters=gensimSettings))
                for sentence in sentenceSplit
            ]
            docs[os.path.basename(filepath)] = sentencePreprocess
    return docs
def process_input(row):
    input_merged = row['Assignment Name'] + ' ' + row['School Category']
    # run gensim's preprocess_string with the txt_filters pipeline, then
    # rejoin the resulting tokens into a single string
    input_processed_tokens = " ".join(preprocess_string(input_merged, txt_filters))
    return input_processed_tokens
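# A hypothetical call for process_input(); txt_filters is a module-level
# global in the original, so a placeholder pipeline is assumed here.
from gensim.parsing.preprocessing import preprocess_string, remove_stopwords, strip_punctuation

txt_filters = [lambda x: x.lower(), strip_punctuation, remove_stopwords]
row = {'Assignment Name': 'Essay on Rivers', 'School Category': 'Primary'}
print(process_input(row))  # 'essay rivers primary'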
def remove_text_duplicates_retain_order(texts):
    """
    Remove the duplicates from a list of text strings, where duplicates are defined as two text
    strings that differ only by punctuation, capitalization, or the length of whitespace runs.
    This is useful for not retaining extra text information just because it's not perfectly
    identical to some existing string. Duplicates are removed such that the first occurrence is
    retained, and that determines the final ordering. The texts that are returned are not
    processed, and are a subset of the original list of text strings; the first occurrence also
    determines which version of each duplicate, in terms of punctuation, capitalization, and
    whitespace, appears in the final list.
    Args:
        texts (list of str): A list of arbitrary strings.
    Returns:
        list of str: A subset of the original list, with duplicates as defined above removed.
    """
    # Create a list of cleaned texts that corresponds to the list of texts passed in.
    filters = [lambda x: x.lower(), strip_tags, strip_punctuation, strip_multiple_whitespaces]
    cleaned_texts = [" ".join(preprocess_string(text, filters)) for text in texts]
    assert len(texts) == len(cleaned_texts)

    # Map each cleaned text to the list of original texts that produced it.
    cleaned_to_originals = defaultdict(list)
    for cleaned_text, text in zip(cleaned_texts, texts):
        cleaned_to_originals[cleaned_text].append(text)

    # Remove duplicates while retaining the order of the list of cleaned texts.
    cleaned_texts_no_duplicates = remove_duplicates_retain_order(cleaned_texts)

    # Rebuild the list using the first observed original text for each cleaned text.
    original_texts_with_same_removals = [
        cleaned_to_originals[cleaned_text][0] for cleaned_text in cleaned_texts_no_duplicates
    ]
    return original_texts_with_same_removals
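# A quick illustrative call; the sample strings are made up. The first two
# entries clean to the same string, so only the first spelling survives.
texts = ["A dog barks!", "a  dog   barks", "A cat meows."]
print(remove_text_duplicates_retain_order(texts))
# -> ['A dog barks!', 'A cat meows.']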
def transform(self, df_x):
    return np.asmatrix(
        np.array([
            self._model.infer_vector(preprocess_string(row['reviews_content']))
            for index, row in df_x.iterrows()
        ]))
def read_documents(path):
    dataset = []
    filters = [
        lambda x: x.lower(),
        strip_multiple_whitespaces,
        strip_numeric,
        strip_non_alphanum,
        strip_punctuation,
        remove_stopwords,
        strip_tags,
        lambda s: strip_short(s, minsize=4),
    ]
    LEN_THRESHOLD = 10
    for root, dirs, files in os.walk(path):
        if os.path.basename(root) in ["Cog", "NotCog"]:
            print(root)
            for f in files:
                with open(os.path.join(root, f), "r") as myfile:
                    text = myfile.read()
                text = re.sub(r"[^\x00-\x7F]+", " ", text)
                res = []
                doc = nlp(text)
                for sent in doc.sents:
                    sent = " ".join([word.lemma_ for word in sent])
                    res.append(" ".join(preprocess_string(sent, filters=filters)))
                text = "\n".join(res)
                label = 0 if os.path.basename(root) == "Cog" else 1
                if len(text) < LEN_THRESHOLD:
                    print(f)
                    continue
                dataset.append((text, label))
    random.shuffle(dataset)
    texts, labels = zip(*dataset)
    return texts, labels
def cleaner(path, d):
    with codecs.open(path, encoding='utf8') as f:
        text = f.read()
    text1 = unidecode.unidecode(text)

    # remove roman numerals
    text1 = removeRomanNumerals(text1)

    # strip numbers, whitespace, and punctuation
    EMBEDDING_FILTERS = [
        lambda x: x.lower(),
        strip_numeric,
        strip_multiple_whitespaces,
        strip_punctuation,
    ]
    c = preprocess_string(text1, EMBEDDING_FILTERS)

    # replace and correct words
    c = replaceWordsFromMap(c, correctionDict)
    c = replaceWordsFromMap(c, syncopateDict)
    c = replaceWordsFromMap(c, variantDict)
    c = replaceWordsFromMap(c, variantDict2)

    d = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in c]
    t = " ".join(d)
    tokens = parallelRemove(d, 4)
    return tokens, t
def flat_doc(document, model, extremes=None):
    flat_doc = ""
    for field in document:
        if not isinstance(document[field], list):
            continue  # skip the 'id' and '_version_' fields, auto-generated by Solr
        for value in document[field]:
            ## language detection and translation ##
            if field in ('author.authors.authorName', 'author.authorBio',
                         'description', 'quotes.quoteText'):
                value_blob = TextBlob(value)
                try:
                    if value_blob.detect_language() != 'en':
                        try:
                            value = value_blob.translate(to='en')
                        except Exception as e:
                            value = value  # NotTranslated: API returned the input unchanged
                except Exception as e:
                    value = value  # TranslatorError: needs at least 3 characters
            ############################
            flat_doc += str(value) + ' '  # flatten the document into a single string
    flat_doc = preprocess_string(flat_doc, CUSTOM_FILTERS)  # preprocess the string
    flat_doc = [w for w in flat_doc if w not in stop_words]  # remove stop words
    if extremes:
        flat_doc = [w for w in flat_doc if w not in extremes]
    flat_doc = [w for w in flat_doc if w in model.vocab]  # keep only in-vocabulary words
    if flat_doc == []:
        flat_doc = ['book']  # if the book ends up empty, add a token to avoid problems later
    return flat_doc
def tokenize(self, text):
    """Tokenizes the provided text.

    Args:
        text (str): The text to be tokenized.

    Returns:
        list(tuple(str, int)): A list of (token, count) pairs from the text
        without the stopwords.
    """
    # make everything lowercase and strip punctuation
    CUSTOM_FILTERS = [lambda x: x.lower(), strip_punctuation]
    tokens = preprocess_string(text, CUSTOM_FILTERS)

    # filter out all stopwords
    filtered_tokens = [w for w in tokens if w not in self.__stopwords]

    # count the term frequency in the text
    count = {}
    for word in filtered_tokens:
        if word not in count:
            count[word] = 0
        count[word] += 1

    # sort the terms in descending order of frequency
    terms_sorted = sorted(count.items(), key=operator.itemgetter(1), reverse=True)
    return terms_sorted
def create_bigrams_and_remove_Stopwords(raw_transcripts_gensim):
    # create phrases like good_afternoon and hdfc_life so that they can be
    # removed as part of the custom stop words
    import gensim
    from gensim.parsing.preprocessing import preprocess_string, remove_stopwords

    Preprocessed_Transcripts = []
    bigrams = []
    for transcripts in raw_transcripts_gensim["Lemmatized_transcript"]:
        tokens = [list(gensim.utils.tokenize(transcripts, lower=True))]
        bigram_mdl = gensim.models.phrases.Phrases(tokens, min_count=1, threshold=5)
        CUSTOM_FILTERS = [remove_stopwords]
        tokens = [
            preprocess_string(" ".join(word), CUSTOM_FILTERS) for word in tokens
        ]
        bigrams = bigram_mdl[tokens]
        Preprocessed_Transcripts.append(list(bigrams))

    # final preprocessing step: remove stop words
    Final_Preprocessed_Transcripts = []
    for tokenised_text in Preprocessed_Transcripts:
        for token in tokenised_text:
            Final_Preprocessed_Transcripts.append(preprocess(token))

    # append the final preprocessed transcripts to the dataframe
    raw_transcripts_gensim["Preprocessed Transcripts"] = Final_Preprocessed_Transcripts
    return raw_transcripts_gensim
def getExistingWordsFromModel(words):
    """Check which words from a list exist in the word2vec model's vocabulary."""
    # the filters must be the callables themselves, not wrapped in a lambda
    CUSTOM_FILTERS = [strip_numeric, remove_stopwords]
    res = []
    for w in words:
        try:
            vec = word_vectors[w]
            res.append(w)
        except Exception:
            try:
                w_transformed = (w.replace(".", "").replace("=", "").replace("-", "")
                                 .replace("*", "").replace("'", "").replace("`", "")
                                 .replace("|", "").replace("\\", "").replace("/", "")
                                 .replace("$", "").replace("^", "").replace("&", "")
                                 .replace("@", "").replace("%", ""))
                vec = word_vectors[w_transformed]
                res.append(w_transformed)
            except Exception:
                try:
                    w_stripped = preprocess_string(w_transformed, CUSTOM_FILTERS)
                    vec = word_vectors[w_stripped]
                    res.append(w_stripped)
                except Exception:
                    continue
    return res
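# word_vectors is a module-level model in the original (e.g. a gensim
# KeyedVectors); a tiny dict stand-in is assumed here just to exercise the
# fallback chain: direct hit, hit after stripping symbols, then skip.
word_vectors = {'cash': [0.1], 'flow': [0.2]}
print(getExistingWordsFromModel(['cash', 'flow*', 'the']))  # ['cash', 'flow']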
def __iter__(self):
    with open(self.filename) as file:
        for line in file:
            line = preprocess_string(line, [
                lambda x: x.lower(),
                strip_tags,
                strip_multiple_whitespaces,
            ])
            yield ['<SOS>', *line, '<EOS>']
def cleanlines2(text):
    '''Clean text by removing URLs, punctuation, numbers, and whitespace,
    and converting to lowercase.'''
    text1 = str(text).lower()
    lines = []
    # split into lines
    for line in text1.split('\n'):
        line = str(line)
        line = line.strip('\n')
        if line:
            lines.append(line)
    cleantext = ''
    for line in lines:
        filterreg = config.LABELREGEX.search(line)
        if filterreg is None:
            cleantext = cleantext + line
        else:
            if filterreg.group():
                pass
            else:
                cleantext = cleantext + line
    cleantext = str(cleantext)
    text1 = re.sub('\\S*@\\S*\\s?', '', cleantext)  # remove emails
    text1 = re.sub("\'", "", text1)                 # remove single quotes
    text1 = re.sub('\\s+', ' ', text1)              # collapse whitespace, incl. newlines
    text1 = re.sub(r'http\S+', '', text1)           # remove URLs
    text1 = tokenize(str(text1))
    text1 = str(text1)
    # use gensim to remove numbers, punctuation, whitespace, stopwords, and
    # non-alpha tokens, convert to lowercase, and stem
    text1 = ' '.join(preprocess_string(str(text1)))
    return text1
def clean_texts(texts: list) -> list:
    clean_texts = []
    for text in texts:
        processed_texts = preprocess_string(text, CUSTOM_FILTERS)
        processed_texts = [w for w in processed_texts if w not in STOP_WORDS]
        clean_texts.append(processed_texts)
    return clean_texts
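# A runnable sketch for clean_texts(); CUSTOM_FILTERS and STOP_WORDS are
# module-level globals in the original, so placeholder values are assumed.
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation

CUSTOM_FILTERS = [lambda x: x.lower(), strip_punctuation]
STOP_WORDS = {'the', 'and'}
print(clean_texts(['The cat and the dog!']))  # [['cat', 'dog']]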
def output_csv_files(output_dfs, output_df_strs):
    assert len(output_dfs) == len(output_df_strs)
    sia = SIA()
    for index in range(len(output_dfs)):
        preprocsplit = lambda rev: preprocess_string(str(rev))
        output_df, stem_name = output_dfs[index], output_df_strs[index]
        output_df[2] = output_df[1].astype(str).apply(preprocsplit)

        # build a new dataframe holding only the rows that mention professionalism
        new_output_df = pd.DataFrame()
        for (_, row) in output_df.iterrows():
            if 'unprofession' in list(row[2]) or 'profession' in list(row[2]):
                new_output_df = new_output_df.append(row, ignore_index=True)

        # keep the rows whose sentiment polarity contradicts the label
        misclassified_df = pd.DataFrame()
        for (_, row) in new_output_df.iterrows():
            if (('unprofession' in list(row[2]) and
                 sia.polarity_scores(row[0])['compound'] > 0.0) or
                    ('profession' in list(row[2]) and
                     sia.polarity_scores(row[0])['compound'] < 0.0)):
                misclassified_df = misclassified_df.append(row, ignore_index=True)

        f_name = 'misclassified_' + stem_name + '.csv'
        print(type(misclassified_df))
        print(f_name)
        misclassified_df.to_csv(f_name)
    return
def trainD2V(fileName):
    with open(fileName, 'r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter=';')
        corpus = []
        for row in reader:
            if row[5] and len(row[5]) > 10:
                pp_news = pp.preprocess_string(row[5], CUSTOM_FILTERS)
                corpus.append(pp_news)
    tagged_documents = []
    for i, doc in enumerate(corpus):
        tagged = TaggedDocument(doc, [i])
        tagged_documents.append(tagged)
    dv = Doc2Vec(tagged_documents, vector_size=100, window=3, min_count=10,
                 workers=4, epochs=100)
    dv.train(tagged_documents, total_examples=dv.corpus_count, epochs=dv.epochs)
    return dv
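# A hypothetical follow-up: once trainD2V() has built a model, unseen text can
# be embedded with infer_vector after the same preprocessing, reusing the
# snippet's pp alias and CUSTOM_FILTERS global. The CSV path is an assumption.
dv = trainD2V('news.csv')
vec = dv.infer_vector(pp.preprocess_string('some unseen news text', CUSTOM_FILTERS))
print(vec.shape)  # (100,)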
def chatcode():
    global name
    print('\n\nHello! Thanks for coming here. I am a chatbot. People say that '
          'I am a kind and approachable bot.')
    name = input('Please tell me your name.\n')
    try:
        preprocessed = [word for word in preprocess_string(name)
                        if word not in ('people', 'call', 'friend')][0]
        name = [word for word in strip_non_alphanum(name.lower()).split()
                if preprocessed in word][0]
    except Exception:
        name = name.split()[0]
    name = name[0].upper() + name[1:]
    print("Hi " + name + "! My name's CAFE BUDDY. Let's start with our session.")
    response = input("How are you doing?\n")
    if predict(response) >= 0.55:
        response = input('That is good. Are you usually this happy, or are there '
                         'some worries that you want to talk about?\n')
        if predict(response) >= 0.7:
            response = input('You seem to be really content. Wanna sign off?\n')
            if predict(response) >= 0.7:
                print('Ok, bye ' + name + '!')
            else:
                response = input('Is there something bothering you? Would you '
                                 'share it with me?\n')
                if predict(response) >= 0.7:
                    print("That's okay. It was nice talking to you. You can chat "
                          "with me anytime you want.\nBye " + name + "!")
                else:
                    sad1()
        else:
            sad1()
    else:
        sad3()
def trainW2V(fileName):
    with open(fileName, 'r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter=';')
        corpus = []
        for row in reader:
            try:
                if index_in_list(row, 5) and len(row[5]) > 10:
                    pp_news = pp.preprocess_string(row[5], CUSTOM_FILTERS)
                    corpus.append(pp_news)
            except Exception:
                print("skipped: " + row[0])
    EMBEDDING_FILE = 'GoogleNews-vectors-negative300.bin.gz'
    google_model = Word2Vec(size=300, window=5, min_count=2, workers=-1)
    google_model.build_vocab(corpus)
    google_model.intersect_word2vec_format(EMBEDDING_FILE, lockf=1.0, binary=True)
    google_model.train(corpus, total_examples=google_model.corpus_count, epochs=5)
    return google_model.wv
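# trainW2V() returns the model's KeyedVectors; a hypothetical follow-up query
# (the CSV path and query word are assumptions, and the word must have
# survived the min_count=2 vocabulary cut):
wv = trainW2V('news.csv')
print(wv.most_similar('economy', topn=3))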
def getNewsRecommendationDoc2Vec(fileName, email, preference, fileRatings, alreadyLiked):
    userNews = []  # holds, per user, entries of [email, news link, cosine similarity]
    documents = []
    links = []
    count = 0
    query = calculate_centroid(preference, 3, dv)
    with open(fileName, 'r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter=';')
        for row in reader:
            if index_in_list(row, 5) and len(row[5]) > 10:
                news = []  # empty list that will hold this user's info for one news item
                # preprocess the news description
                pp_news = pp.preprocess_string(row[5], CUSTOM_FILTERS)
                if len(pp_news) > 2:
                    # compute the centroid for each news item
                    newsVector = calculate_centroid(pp_news, 3, dv)
                    try:
                        # cosine similarity between the news centroid and the
                        # centroid of the user's preference
                        cos_sim = 1 - spatial.distance.cosine(query, newsVector)
                    except Exception:
                        cos_sim = 0
                    # news info
                    news.append(email)    # email
                    news.append(row[1])   # link
                    news.append(cos_sim)  # cosine similarity
                    # add the news item to the list of news
                    userNews.append(news)

    # sort by cosine similarity in descending order
    userNews.sort(key=itemgetter(2), reverse=True)
    with io.open(fileRatings, "a", encoding="utf-8") as myfile:
        i = 0
        for news in userNews:
            if i > 4:
                break
            if not (news[1] in alreadyLiked):
                myfile.write(news[0] + ";")            # email
                myfile.write(news[1] + ";")            # link
                myfile.write('{} \n'.format(news[2]))  # cosine similarity
                i = i + 1
    print('Finished writing to ' + fileRatings + ' for ' + email + '!')
    return ''
def load_data(fname):
    print('input file name:', fname)

    target = []  # labels
    source = []  # document vectors

    # build the document list
    document_list = []
    word_list = []
    for l in open(fname, 'r').readlines():
        sample = l.strip().split(' ', 1)
        label = sample[0]
        target.append([label])  # label
        word_list = preprocess_string(sample[1])  # stopword removal and stemming
        document_list.append(word_list)  # per-document word list

    # build the dictionary, excluding very rare and very frequent words
    dct = Dictionary(document_list)
    dct.filter_extremes(no_below=3, no_above=0.6)

    # vectorize each document as a bag of words
    for doc in document_list:
        tmp = dct.doc2bow(doc)  # e.g. [(4, 1), (23, 1), ..., (119, 2)]
        dense = list(matutils.corpus2dense([tmp], num_terms=len(dct)).T[0])
        source.append(dense)

    dataset = {}
    dataset['target'] = np.array(target)
    dataset['source'] = np.array(source)
    return dataset
def parse_and_load_discussion_answers(course_data_path, conn, course_zip_name):
    """Load, parse, and process discussion answers."""
    course_slug = course_zip_name.replace("_", "-")
    sql_select_discussion_answer = (
        "SELECT discussion_answer_id, discussion_answer_content " +
        "FROM discussion_answers, courses WHERE " +
        "discussion_answers.course_id == courses.course_id AND " +
        "courses.course_slug == (?)")
    c = conn.cursor()
    c.execute(sql_select_discussion_answer, (course_slug,))
    course_answers = {}
    rows = c.fetchmany()
    while rows:
        for row in rows:
            answer_id, answer_content = row
            course_answers[answer_id] = preprocess_string(answer_content)
        rows = c.fetchmany()

    # save the course_answers to disk
    answers_filepath = os.path.join(course_data_path, "..",
                                    "answers.{}.json".format(course_slug))
    with open(answers_filepath, "w") as answers_file:
        json.dump(course_answers, answers_file)
def setUp(self):
    """Set up the Lee test corpora."""
    global bg_corpus, corpus, human_sim_vector, bg_corpus2, corpus2
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
    bg_corpus_file = 'lee_background.cor'
    corpus_file = 'lee.cor'
    sim_file = 'similarities0-1.txt'

    # read in the corpora
    with open(os.path.join(pre_path, bg_corpus_file)) as f:
        bg_corpus = preprocess_documents(f)
    with open(os.path.join(pre_path, corpus_file)) as f:
        corpus = preprocess_documents(f)
    with open(os.path.join(pre_path, bg_corpus_file)) as f:
        bg_corpus2 = [preprocess_string(s, filters=DEFAULT_FILTERS[:-1]) for s in f]
    with open(os.path.join(pre_path, corpus_file)) as f:
        corpus2 = [preprocess_string(s, filters=DEFAULT_FILTERS[:-1]) for s in f]

    # read the human similarity data
    sim_matrix = np.loadtxt(os.path.join(pre_path, sim_file))
    sim_m_size = np.shape(sim_matrix)[0]
    human_sim_vector = sim_matrix[matutils.triu_indices(sim_m_size, 1)]
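# The DEFAULT_FILTERS[:-1] slice above drops the final stem_text filter. This
# standalone comparison (the sample sentence is invented) shows the effect:
from gensim.parsing.preprocessing import DEFAULT_FILTERS, preprocess_string

print(preprocess_string('The flying machines'))                                # stemmed tokens
print(preprocess_string('The flying machines', filters=DEFAULT_FILTERS[:-1]))  # unstemmed tokens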
def main(param_file=None):
    # setup
    p, base_path, output_dir = tools.setup(param_file)
    logger = tools.get_logger('gensim', os.path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # initializations
    articles = {}
    all_missing = []
    redir_on = {}
    collisions = {}
    non_ascii = []
    site = mwclient.Site('en.wikipedia.org', '/w/api.php/')

    # get all txt files in a folder and iterate over them
    filelist = glob.glob(os.path.join(base_path, p['folder_path'], "*.txt"))
    for f in filelist:
        # get the word we are working on
        f_name = os.path.basename(f)
        k_word = os.path.splitext(f_name)[0]
        logger.info("working on file: %s" % f_name)

        # try to convert the word into ascii for the http query
        file_obj = codecs.open(f, "r", "utf-16")
        counter = 0
        words = []
        for w in file_obj.readlines():
            try:
                s = w.strip().decode('ascii')
                words.append(s)
            except Exception:
                counter += 1
                non_ascii.append(w.strip())
        logger.info("\t%d words containing non ascii are omitted" % counter)

        articles[k_word] = {}
        logger.info("\tfound %d words in file" % len(words))
        for word in words:
            data = {}
            page = site.Pages[word]

            # follow the redirect and check for collisions
            if page.redirect:
                res = re.search(r'\[\[(.+)\]\]', page.edit())
                redir_word = urllib.unquote(res.groups()[0])
                if redir_word in redir_on:
                    logger.warning("[%s AND %s] both redirect on --> %s" %
                                   (word, redir_on[redir_word], redir_word))
                    collisions[redir_word] = redir_on[redir_word]
                else:
                    logger.info("[%s] redir from [%s]" % (redir_word, word))
                    redir_on[redir_word] = word
                text = site.Pages[redir_word].edit()
                data['redirected'] = redir_word
            else:
                text = page.edit()

            # check for missing wikipedia articles
            if text == "":
                all_missing.append(word)
                continue

            # preprocess the received article
            data['text'] = wikicorpus.filter_wiki(text)
            in_ascii = ud.normalize('NFKD', data['text']).encode('ascii', 'ignore')
            data['text'] = preprocess_string(in_ascii)
            articles[k_word][word] = data

    logger.info('add human rating to the articles')
    id_word = {}
    sparql_path = os.path.join(base_path, p['sparql_path'])
    with open(os.path.join(sparql_path, 'id_word.txt')) as f:
        for line in f.readlines():
            idx, word = line.strip().split('\t')
            id_word[idx] = word

    # add human rating to the wikipedia data
    not_found = []
    with open(os.path.join(sparql_path, p['human_file'])) as f:
        for line in f.readlines():
            arr = line.split()
            word = id_word[arr[0]]
            term = arr[3]
            try:
                articles[word][term]['rating'] = int(arr[4])
            except KeyError:
                not_found.append(term)
    logger.info("%d words from the ref queries not found" % len(not_found))

    f = open(os.path.join(output_dir, "articles.pickle"), 'wb')
    pickle.dump(articles, f)
    f.close()

    info = {}
    info['missing'] = all_missing
    info['redirs'] = redir_on
    info['collisions'] = collisions
    info['not_found'] = not_found
    info['non_ascii'] = non_ascii
    f = open(os.path.join(output_dir, "info.pickle"), 'wb')
    pickle.dump(info, f)
    f.close()

    logger.info("%d redirecting collisions (see info.pkl)" % len(collisions))
# download the content of the article
# some redirects introduce non-ascii characters
# TODO introduce a proper conversion of these characters
try:
    title = title.decode('ascii')
except Exception:
    continue
query = (query_base + "&export") % title
text = myopener.open(query).read()
soup = BSS(text, convertEntities=BSS.ALL_ENTITIES)
export = BSS(soup.api.query.export.prettify())
text = BSS(export.mediawiki.page.revision.prettify())
if text.revision.minor:
    data['text'] = wikicorpus.filterWiki(text.revision.minor.text)
else:
    data['text'] = wikicorpus.filterWiki(text.revision.text)
in_ascii = unicodedata.normalize('NFKD', data['text']).encode('ascii', 'ignore')
data['text'] = preprocess_string(in_ascii)
articles[k_word][title] = data

f = open(results_path + "sparql_wiki.pickle", 'wb')
pickle.dump(articles, f)
f.close()

print(sum(all_missing, []))