import time, hashlib, re
from collections import defaultdict
from Utils.string_utils import collapse_spaces
from Utils.file_utils import find_files, load_stop_words

def compute_ngrams(tokens, max_len=None, min_len=1):
    """
    tokens : iterable of string
        a single sentence of tokens. Assumes start and stop tokens omitted
    max_len : int
        maximum ngram length
    min_len : int
        minimum ngram length
    """
    if max_len is None:
        max_len = len(tokens)

    if min_len > max_len:
        raise Exception("min_len cannot be more than max_len")

    ngrams = set()
    # all contiguous n-grams from min_len up to max_len
    for ngram_size in range(min_len, max_len + 1):
        for start in range(0, len(tokens) - ngram_size + 1):
            end = start + ngram_size - 1
            words = []
            for i in range(start, end + 1):
                words.append(tokens[i])
            ngrams.add(tuple(words))  # make a tuple so hashable
    return ngrams

# we may want to keep some non-alpha characters, such as # in C# and + in C++, etc.
re1 = re.compile(r"[;:\'\"\*/\),\(\-\|\s]+")

def remove_punct(s):
    s = s.replace("'s", " ")
    return collapse_spaces(re1.sub(" ", s).strip())

def hash_string(s):
    hash_object = hashlib.md5(s.encode("utf-8"))
    return str(hash_object.hexdigest())

# recursive algorithm to eliminate shorter phrases with the same or similar doc freq as longer phrases
def find_sub_phrases_to_remove(tpl_phrase, valid_phrases, doc_freq, to_rem):
    if len(tpl_phrase) <= 1:
        return
    phrase_df = doc_freq[tpl_phrase]
    ngrams = compute_ngrams(tpl_phrase, len(tpl_phrase) - 1, 1)
    for tpl_ngram in ngrams:
        if tpl_ngram in valid_phrases and tpl_ngram not in to_rem:
            sub_phr_df = doc_freq[tpl_ngram]
            # if the sub-phrase's doc freq is close to the same frequency, drop the sub-phrase
            if phrase_df >= (0.9 * sub_phr_df):
                to_rem.add(tpl_ngram)
                #to_rem_dbg.add((tpl_phrase, tpl_ngram, phrase_df, sub_phr_df))
                find_sub_phrases_to_remove(tpl_ngram, valid_phrases, doc_freq, to_rem)

""" Extract Phrases """
import sys
from Config.extract_keywords_config import ExtractKeywordsConfig

if len(sys.argv) != 2:
    raise Exception("Incorrect number of arguments passed - one expected, the config file name")

#sys.argv[0] is this script file, sys.argv[1] should be the config file
config = ExtractKeywordsConfig(sys.argv[1])
script_start = time.time()

if config.stop_words_file:
    stop_words = load_stop_words(config.stop_words_file)
    print("%i stop words loaded" % len(stop_words))
else:
    stop_words = set()

""" Load Documents """
start = time.time()
files = find_files(config.processed_documents_folder, config.file_mask, True)
print("%s files found in %s" % (len(files), config.processed_documents_folder))
documents = []
for i, fname in enumerate(files):
    with open(fname) as f:
        contents = f.read()
        documents.append(contents.split("\n"))
end = time.time()
print("Loading %i documents took %s seconds" % (len(files), str(end - start)))

""" Extract Common Terms and Phrases """
start = time.time()
#Or use a Counter here.
doc_freq = defaultdict(int)

# remove short docs
tokenized_docs = []
sent_id = 0
sent_ids = set()
lens = []
hashed = set()

""" Find single word keywords """
for doc in documents:
    un_tokens = set()
    tok_sents = []
    for sent in doc:
        cl_sent = remove_punct(sent.lower())
        hash_sent = hash_string(cl_sent)
        # remove dupe sentences (note - will hurt df accuracy a little)
        if hash_sent in hashed:
            continue
        hashed.add(hash_sent)
        tokens = tuple(cl_sent.split(" "))
        lens.append(len(tokens))
        sent_id += 1
        tok_sents.append((sent_id, tokens))
        sent_ids.add(sent_id)

        # create inverted index and unique tokens (for doc freq calc)
        proc_tokens = set()
        for tok in tokens:
            if tok not in proc_tokens:
                proc_tokens.add(tok)
                if tok not in un_tokens:
                    un_tokens.add(tok)
                    doc_freq[tok] += 1

    if len(tok_sents) > 0:
        tokenized_docs.append(tok_sents)
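# Added worked example of find_sub_phrases_to_remove with hypothetical doc-frequency
# values (not part of the pipeline above): "machine learning" occurs in 100 docs and
# "machine" in 105, so the shorter phrase carries little extra information and is
# flagged for removal; "learning" (300 docs) stands on its own and is kept.
_demo_doc_freq = {("machine", "learning"): 100, ("machine",): 105, ("learning",): 300}
_demo_valid = set(_demo_doc_freq.keys())
_demo_to_remove = set()
find_sub_phrases_to_remove(("machine", "learning"), _demo_valid, _demo_doc_freq, _demo_to_remove)
assert _demo_to_remove == {("machine",)}  # 100 >= 0.9 * 105, but 100 < 0.9 * 300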
end = time.time()
print("Extracting Keywords from %i documents took %i secs" % (len(tokenized_docs), end - start))

# Get really frequent items for removal
num_docs = float(len(tokenized_docs))
above_threshold = [k for k, v in doc_freq.items() if v >= config.min_document_frequency]

# remove really frequent terms (in config.max_proportion_documents or more of documents)
too_freq = set([k for k in above_threshold if (doc_freq[k] / num_docs) >= config.max_proportion_documents])

freq_terms = [k for k in above_threshold if k not in stop_words and k not in too_freq]
print("%s frequent terms identified for building phrases" % len(freq_terms))

""" Find Phrases """
import time

start = time.time()

# Find all phrases up to config.max_phrase_length occurring in at least config.min_document_frequency documents
phrase_doc_freq = defaultdict(int)
for term in freq_terms:
    phrase_doc_freq[tuple([term])] = doc_freq[term]

# data structure is a list of list (document) of pairs - sentences: (int, list (of tokens))
# each item is a doc, a list of sents. each sent is a list of valid remaining phrases
# seed with one valid phrase per sent
#working_docs = [map(lambda sent: [sent], d) for d in tokenized_docs]
working_docs = [[(sid, [sent]) for sid, sent in d] for d in tokenized_docs]
working_freq_terms = set(freq_terms)

# sentences with one or more phrases that are frequent enough (under the apriori algorithm closure principle)
# don't bother whittling this down further at the start, almost all sentences have at least one freq term in them
working_sent_ids = set(sent_ids)

""" Apriori-like Algorithm for Phrase Extraction """
# use the downward closure principle from the apriori algorithm (https://en.wikipedia.org/wiki/Apriori_algorithm)
# combined with an inverted index to very efficiently extract common phrases
for phrase_len in range(2, config.max_phrase_length + 1):
    phrase_start = time.time()
    print("phrase_len", phrase_len)
    print(len(working_docs), "docs", len(working_freq_terms), "terms", len(working_sent_ids), "sentences")

    # doc freq for phrases of the current phrase_len
    current_phrase_doc_freq = defaultdict(int)
    # used to look up sentence ids by phrase
    phrase2sentids = defaultdict(set)

    new_work_docs = []
    for doc in working_docs:
        new_work_sents = []
        unique_potential_phrases = set()
        for sent_id, phrases in doc:
            if sent_id not in working_sent_ids:
                continue
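# A minimal, self-contained sketch of the downward-closure idea used above (added
# illustration; the toy sentences and _MIN_DF threshold are hypothetical, and each
# toy sentence stands in for a document). A phrase can only be frequent if all of
# its sub-phrases are, so length-2 candidates are built solely from surviving
# length-1 terms.
from collections import defaultdict

_toy_sents = [("big", "data", "engineer"), ("big", "data", "analyst"), ("data", "engineer")]
_MIN_DF = 2
_df1 = defaultdict(int)
for _s in _toy_sents:
    for _t in set(_s):
        _df1[_t] += 1
_freq1 = {t for t, c in _df1.items() if c >= _MIN_DF}   # {'big', 'data', 'engineer'}
_df2 = defaultdict(int)
for _s in _toy_sents:
    for _a, _b in zip(_s, _s[1:]):
        if _a in _freq1 and _b in _freq1:                # closure: skip pairs containing an infrequent term
            _df2[(_a, _b)] += 1
_freq2 = {p for p, c in _df2.items() if c >= _MIN_DF}    # {('big', 'data'), ('data', 'engineer')}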
import os, re, time
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from Utils.string_utils import clean_str, strip_non_ascii
from Utils.file_utils import find_files, delete_files
import ntpath

__REPL__ = ".\n"

# Make common html tags line breaks
def pre_process_text(txt):
    txt = txt.replace("</li><li>", __REPL__).replace("<li>", __REPL__).replace("</li>", __REPL__)
    txt = txt.replace("<br>", __REPL__)
    txt = txt.replace("<br/>", __REPL__)
    txt = txt.replace("<br />", __REPL__)
    txt = txt.replace("<p>", __REPL__)
    txt = txt.replace("<p/>", __REPL__)
    txt = txt.replace("<p />", __REPL__)
    txt = txt.replace("</p>", __REPL__)
    txt = txt.replace(". .", __REPL__)
    txt = txt.replace("&nbsp;", " ")
    while ".." in txt:
        txt = txt.replace("..", ". ")
    while "  " in txt:
        txt = txt.replace("  ", " ")
    return txt

def visible(element):
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    elif re.match('<!--.*-->', strip_non_ascii(element)):
        return False
    return True

def get_text(html):
    bs = BeautifulSoup(html, "html.parser")
    texts = bs.findAll(text=True)
    visible_texts = filter(visible, texts)
    return __REPL__.join(visible_texts)

def parse_html(html):
    txt = get_text(pre_process_text(html))
    return txt

def split_into_sentences(txt):
    txt = strip_non_ascii(txt)
    sents = map(clean_str, sent_tokenize(txt))
    return filter(lambda s: len(s.strip()) > 5, sents)

# e.g. ntpath.basename("a/b/c") -> "c"
def get_file_name(path):
    head, tail = ntpath.split(path)
    return tail or ntpath.basename(head)

""" Process Files """
import sys
from Config.pre_process_config import PreProcessConfig

if len(sys.argv) != 2:
    raise Exception("Incorrect number of arguments passed - one expected, the config file name")

#sys.argv[0] is this script file, sys.argv[1] should be the config file
config = PreProcessConfig(sys.argv[1])

start = time.time()

if config.empty_processed_documents_folder:
    delete_files(config.processed_documents_folder, config.file_mask)

files = find_files(config.documents_folder, config.file_mask, True)
for i, fpath in enumerate(files):
    with open(fpath) as f:
        contents = f.read()
        if len(contents) < config.minimum_file_size_chars:
            continue

        if config.parse_html:
            contents = parse_html(contents)
            if len(contents) < config.minimum_file_size_chars:
                continue

        sents = split_into_sentences(contents)
        doc = "\n".join(sents)
        file_name = get_file_name(fpath)
        fout_name = config.processed_documents_folder + "/" + file_name.split(".")[0] + "_proc.txt"
        with open(fout_name, "w+") as fout:
            fout.write(doc)
    if i % 1000 == 0 and i > 0:
        print("%i documents processed" % i)
end = time.time()
print("Loading and processing documents took %s seconds" % str(end - start))
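# Added usage sketch for the helpers above (the HTML snippet is hypothetical;
# NLTK's "punkt" models must be available for sent_tokenize, and clean_str /
# strip_non_ascii come from the project's Utils.string_utils).
_sample_html = "<html><body><p>Python rocks. It is great for NLP.</p><br/>C# is nice too.</body></html>"
_clean_text = parse_html(_sample_html)  # tags removed, block-level tags turned into line breaks
for _sentence in split_into_sentences(_clean_text):
    print(_sentence)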
def test_001_default_pattern_relpath(self):
    self.assertEqual(
        find_files(dj('..', 'features', 'a2')),
        set([ap('a2', 'c', 'c1.feature'), ap('a2', 'f1')])
    )
def test_002_excludes_unix(self):
    self.assertEqual(
        find_files(_dir, '[ac]1.feature', ['a/c']),
        set([ap('a2', 'c', 'c1.feature'), ap('a', 'a1.feature')])
    )
def test_000_single_file(self):
    self.assertEqual(
        find_files(dj('f1.feature')),
        set([ap('f1.feature')])
    )