# Imports for the snippets below. It is assumed that simple_tokenize comes from
# gensim.utils and get_stop_words from the stop-words package; read_train_docs,
# perform_query_on_corpus and print_sim are project-local helpers defined elsewhere.
import codecs
import json
import os
import shutil
import sys
from multiprocessing import Process, Queue

import gensim
from gensim.similarities import WmdSimilarity
from gensim.utils import simple_tokenize
from stop_words import get_stop_words

# Module-level list recording the file name of every validation document,
# in the same order as the corpus returned by read_documents().
file_list = []


def read_documents(top_directory, tokens_only=False, remove_stop_words=False,
                   stop_words=None, speller=None, stemmer=None):
    """Read the validation mails (*.json) below top_directory and return them
    either as plain token lists or as gensim TaggedDocument instances."""
    global file_list
    print('Reading validation documents... ', end='')
    sys.stdout.flush()
    result_list = []
    if remove_stop_words and not stop_words:
        stop_words = get_stop_words('de')
    for root, dirs, files in os.walk(top_directory):
        counter = 0
        files.sort()
        for filename in filter(lambda filename: filename.endswith('.json'), files):
            counter += 1
            filename = os.path.join(root, filename)
            mail = json.load(codecs.open(filename, 'r', 'utf-8'))
            body = mail['body']
            title = mail.get('subject', '')
            fContents = []
            if title != '':
                fContents.append(title)
            fContents.extend(body)
            token_list = []
            for line in fContents:
                line = line.strip().lower()
                tokens = simple_tokenize(line)
                if remove_stop_words:
                    stopped_tokens = [i for i in tokens if i not in stop_words]
                else:
                    stopped_tokens = tokens
                # Drop single-character tokens.
                stopped_tokens = [token for token in stopped_tokens if len(token) > 1]
                if speller is not None:
                    stopped_tokens = speller.getSpellCheckedWordList(stopped_tokens)
                if stemmer is not None:
                    stopped_tokens = stemmer.stemWordList(stopped_tokens)
                token_list.extend(stopped_tokens)
            file_list.append(filename)
            if tokens_only:
                result_list.append(token_list)
            else:
                result_list.append(
                    gensim.models.doc2vec.TaggedDocument(token_list, [counter]))
    print('done')
    return result_list
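# Usage sketch (assumption, not taken from the original project): with
# tokens_only=False read_documents yields TaggedDocument objects, which is the
# input format gensim's Doc2Vec expects. The path and parameters below are
# illustrative only.
corpus = read_documents('/data/validation-mails', tokens_only=False)
d2v_model = gensim.models.doc2vec.Doc2Vec(corpus, min_count=2, epochs=20)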
def getSpellCheckedString(self, text):
    """Return text with every misspelled word replaced by the first suggestion
    from the spell checker."""
    words = simple_tokenize(text)
    spell_checked_words = []
    for word in words:
        correct_word = word
        if not self.checkWordSpelling(word):
            suggested_word = self.suggestWord(word)
            if suggested_word is not None and len(suggested_word) > 0:
                correct_word = suggested_word
        spell_checked_words.append(correct_word)
    return ' '.join(spell_checked_words)
def read_all_sentences(top_directory, remove_stop_words=False, stop_words=None,
                       speller=None, stemmer=None):
    """Read every *.txt file below top_directory and return a list of tokenized
    (and optionally stop-word-filtered, spell-checked, stemmed) sentences."""
    sentences = []
    errfile = codecs.open('/tmp/dict-errs.txt', 'w', 'utf-8')
    print('Reading & processing source data... ')
    sys.stdout.flush()
    if remove_stop_words and not stop_words:
        stop_words = get_stop_words('de')
    for root, dirs, files in os.walk(top_directory):
        counter = 0
        files.sort()
        for filename in filter(lambda filename: filename.endswith('.txt'), files):
            counter += 1
            filename = os.path.join(root, filename)
            fContents = codecs.open(filename, 'r', 'utf-8').readlines()
            lineC = len(fContents)
            currL = 0
            for line in fContents:
                if currL % 50 == 0:
                    print('{:-7d}/{:d} Lines processed\r'.format(currL, lineC), end='')
                    sys.stdout.flush()
                currL += 1
                line = line.strip().lower()
                tokens = simple_tokenize(line)
                if remove_stop_words:
                    stopped_tokens = [i for i in tokens if i not in stop_words]
                else:
                    stopped_tokens = tokens
                # Drop single-character tokens.
                stopped_tokens = [token for token in stopped_tokens if len(token) > 1]
                if speller is not None:
                    stopped_tokens = speller.getSpellCheckedWordList(stopped_tokens)
                if stemmer is not None:
                    stopped_tokens = stemmer.stemWordList(stopped_tokens)
                if len(stopped_tokens):
                    sentences.append(stopped_tokens)
    errfile.close()
    print('\ndone')
    return sentences
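# Usage sketch (assumption, not from the original project): the sentences
# returned by read_all_sentences can be used to train the Word2Vec model that
# analyze_directory later loads from w2v_file. Paths and hyper-parameters are
# illustrative only.
sentences = read_all_sentences('/data/source-texts', remove_stop_words=True)
w2v = gensim.models.Word2Vec(sentences, min_count=2, workers=4)
w2v.save('/tmp/w2v.model')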
def read_documents(top_directory, remove_stop_words=False, stop_words=None,
                   speller=None, stemmer=None):
    """Read the training mails (*.json) below top_directory and return a dict
    mapping each topic (sub-directory name) to its list of token lists."""
    topics = {}
    if remove_stop_words and not stop_words:
        stop_words = get_stop_words('de')
    for root, dirs, files in os.walk(top_directory):
        counter = 0
        for dir in dirs:
            topics[dir] = []
        for filename in filter(lambda filename: filename.endswith('.json'), files):
            counter += 1
            filename = os.path.join(root, filename)
            mail = json.load(codecs.open(filename, 'r', 'utf-8'))
            body = mail['body']
            title = mail.get('subject', '')
            fContents = []
            if title != '':
                fContents.append(title)
            fContents.extend(body)
            token_list = []
            for line in fContents:
                line = line.strip().lower()
                tokens = simple_tokenize(line)
                if remove_stop_words:
                    stopped_tokens = [i for i in tokens if i not in stop_words]
                else:
                    stopped_tokens = tokens
                stopped_tokens = [token for token in stopped_tokens if len(token) > 1]
                if speller is not None:
                    stopped_tokens = speller.getSpellCheckedWordList(stopped_tokens)
                if stemmer is not None:
                    stopped_tokens = stemmer.stemWordList(stopped_tokens)
                token_list.extend(stopped_tokens)
            # The topic is the name of the directory the mail lives in.
            file_topic = os.path.basename(os.path.dirname(filename))
            if file_topic in topics:
                topics[file_topic].append(token_list)
    return topics
def addWordsFromText(self, text):
    """Tokenize text and add every token to the spell checker's dictionary."""
    words = simple_tokenize(text)
    for word in words:
        self.speller.add(word)
def analyze_directory(indir, ground_truth_dir, w2v_file, outfile, verbose, outdir,
                      numthreads, lang, remove_stop_words, speller=None, stemmer=None):
    """Classify every mail in indir against the ground-truth topics using
    Word Mover's Distance, write the results as JSON and optionally copy the
    mails into per-topic sub-directories of outdir."""
    try:
        w2v_model = gensim.models.Word2Vec.load(w2v_file)
    except Exception:
        print('Could not read W2V-File ({})'.format(w2v_file))
        sys.exit(1)
    stop_words = None
    if remove_stop_words:
        stop_words = get_stop_words(lang)
    topic2docid = []
    topics = read_train_docs(ground_truth_dir, remove_stop_words, stop_words,
                             speller, stemmer)
    train_corpus = []
    for key in topics.keys():
        documents = topics[key]
        for document in documents:
            topic2docid.append(key)
            document_tokens = simple_tokenize(document)
            train_corpus.append(document_tokens)
    print('Generating WMD instances... ', end='')
    sys.stdout.flush()
    # One WmdSimilarity index per worker process.
    wmd_instances = []
    for i in range(numthreads):
        wmd_instance = WmdSimilarity(train_corpus, w2v_model,
                                     min(len(train_corpus), 3))
        wmd_instances.append(wmd_instance)
    print('done')
    validate_corpus = read_documents(indir, True, remove_stop_words, stop_words,
                                     speller, stemmer)
    # Split the validation corpus into one chunk per worker; the last worker
    # also takes the remainder.
    chunk_size = int(len(validate_corpus) / numthreads)
    result_list = {}
    queues = []
    processes = []
    for instance in range(numthreads):
        in_q = Queue()
        out_q = Queue()
        queues.append([in_q, out_q])
        if instance == numthreads - 1:
            sp = Process(target=perform_query_on_corpus,
                         args=(wmd_instances[instance],
                               validate_corpus[instance * chunk_size:],
                               instance, len(validate_corpus), numthreads,
                               in_q, out_q))
        else:
            sp = Process(target=perform_query_on_corpus,
                         args=(wmd_instances[instance],
                               validate_corpus[instance * chunk_size:(instance + 1) * chunk_size],
                               instance, len(validate_corpus), numthreads,
                               in_q, out_q))
        processes.append(sp)
    for sp in processes:
        sp.start()
    for i, sp in enumerate(processes):
        in_q, out_q = queues[i]
        vc_index = out_q.get()
        sims_array = out_q.get()
        in_q.put('THANKS')
        in_q.close()
        sp.join()
        if sims_array is not None and len(sims_array) > 0:
            for l_docId, sims in enumerate(sims_array):
                docId = (vc_index * chunk_size) + l_docId
                if verbose:
                    print_sim(validate_corpus[docId], sims, train_corpus, topic2docid)
                if sims:
                    # Keep the best match: its similarity score and its topic.
                    result_list[file_list[docId]] = {
                        'sim': sims[0][1],
                        'type': topic2docid[sims[0][0]],
                    }
    print('done analyzing.')
    if isinstance(outfile, str):
        json_out_f = codecs.open(outfile, 'w', 'utf-8')
    else:
        json_out_f = outfile
    print(json.dumps(result_list, indent=4), file=json_out_f)
    if isinstance(outfile, str):
        json_out_f.close()
    if outdir is not None:
        print('Copying files into subdirectories of {} ... '.format(outdir), end='')
        sys.stdout.flush()
        # Create one sub-directory per topic, replacing characters that are not
        # allowed in path names.
        for key in topics.keys():
            key = key.replace('/', '_').replace(':', '_')
            path = os.path.join(outdir, key)
            if not os.path.exists(path):
                os.makedirs(path)
        for source_file in result_list.keys():
            dest_type = result_list[source_file]['type']
            dest_path = os.path.join(outdir, dest_type)
            dest_file = os.path.join(dest_path, os.path.basename(source_file))
            shutil.copyfile(source_file, dest_file)
        print('done')
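# Usage sketch (assumption, not from the original project): a minimal call to
# analyze_directory. All paths, the language code and the thread count are
# illustrative; speller and stemmer are optional and omitted here.
analyze_directory(indir='/data/incoming-mails',
                  ground_truth_dir='/data/ground-truth',
                  w2v_file='/tmp/w2v.model',
                  outfile='/tmp/classification.json',
                  verbose=False,
                  outdir='/tmp/sorted-mails',
                  numthreads=4,
                  lang='de',
                  remove_stop_words=True)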