import time, hashlib, re
from collections import defaultdict
from Utils.string_utils import collapse_spaces
from Utils.file_utils import find_files, load_stop_words
def compute_ngrams(tokens, max_len=None, min_len=1):
    """
    tokens  :   iterable of string
                a single sentence of tokens. Assumes start and stop tokens omitted
    max_len :   int
                maximum ngram length
    min_len :   int
                minimum ngram length
    """
    if max_len is None:
        max_len = len(tokens)

    if min_len > max_len:
        raise Exception("min_len cannot be more than max_len")

    ngrams = set()
    # build all contiguous ngrams from min_len up to max_len
    for ngram_size in range(min_len, max_len + 1):
        for start in range(0, len(tokens) - ngram_size + 1):
            end = start + ngram_size - 1
            words = []
            for i in range(start, end + 1):
                words.append(tokens[i])
            ngrams.add(tuple(words))  # make a tuple so hashable
    return ngrams
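# Illustrative usage (not part of the original listing), assuming compute_ngrams as
# defined above - every contiguous ngram between min_len and max_len comes back as
# a tuple so it can be hashed (used as a dict key or set member):
#
#   compute_ngrams(["the", "cat", "sat"], max_len=2)
#   => set([("the",), ("cat",), ("sat",), ("the", "cat"), ("cat", "sat")])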
# we may want to keep some non-alpha characters, such as # in C# and + in C++, etc.
re1 = re.compile("[;:\'\"\*/\),\(\-\|\s]+")

def remove_punct(s):
    s = s.replace("'s", " ")
    return collapse_spaces(re1.sub(" ", s).strip())
def hash_string(s):
    hash_object = hashlib.md5(b'%s' % s)
    return str(hash_object.hexdigest())
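# Illustrative usage (not part of the original listing), assuming collapse_spaces from
# Utils.string_utils collapses runs of whitespace into a single space:
#
#   remove_punct("the cat's (really) fast; c++")   # => "the cat really fast c++"
#   hash_string("the cat")                         # => 32-character md5 hex digest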
# recursive algorithm to eliminate shorter phrases with same or similar Doc Freq as longer phrases
def find_sub_phrases_to_remove(tpl_phrase, valid_phrases, doc_freq, to_rem):
    if len(tpl_phrase) <= 1:
        return
    phrase_df = doc_freq[tpl_phrase]
    ngrams = compute_ngrams(tpl_phrase, len(tpl_phrase) - 1, 1)
    for tpl_ngram in ngrams:
        if tpl_ngram in valid_phrases and tpl_ngram not in to_rem:
            sub_phr_df = doc_freq[tpl_ngram]
            # if sub_phrase_df is close to the same frequency
            if phrase_df >= (0.9 * sub_phr_df):
                to_rem.add(tpl_ngram)
                #to_rem_dbg.add((tpl_phrase, tpl_ngram, phrase_df, sub_phr_df))
                find_sub_phrases_to_remove(tpl_ngram, valid_phrases, doc_freq, to_rem)
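# Illustrative usage (not part of the original listing), assuming the definitions above.
# If a shorter phrase appears in roughly the same number of documents as a longer
# phrase that contains it, the shorter phrase adds no information and gets marked
# for removal:
#
#   doc_freq_example = {("machine", "learning", "model"): 95,
#                       ("machine", "learning"): 100,
#                       ("machine",): 500}
#   to_remove = set()
#   find_sub_phrases_to_remove(("machine", "learning", "model"),
#                              set(doc_freq_example.keys()), doc_freq_example, to_remove)
#   # to_remove now contains ("machine", "learning") but not ("machine",),
#   # because 95 >= 0.9 * 100 while 95 < 0.9 * 500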
""" Extract Phrases """import sysfrom Config.extract_keywords_config import ExtractKeywordsConfig
if len(sys.argv) != 2:
    raise Exception("Incorrect number of arguments passed - one expected, the config file name")
# sys.argv[0] is this script file, sys.argv[1] should be the config file
config = ExtractKeywordsConfig(sys.argv[1])
script_start = time.time()
if config.stop_words_file:
    stop_words = load_stop_words(config.stop_words_file)
    print("%i stop words loaded" % len(stop_words))
else:
    stop_words = set()
""" Load Documents """start = time.time()files = find_files(config.processed_documents_folder, config.file_mask, True)print("%s files found in %s" % (len(files), config.processed_documents_folder))documents = []for i, fname in enumerate(files): with open(fname) as f:        contents = f.read()        documents.append(contents.split("\n"))end = time.time()print("Loading %i documents took %s seconds" % (len(files), str(end - start)))
""" Extract Common Terms and Phrases """start = time.time()#Or use a counter here.doc_freq = defaultdict(int)
# remove short docs
tokenized_docs = []
sent_id = 0
sent_ids = set()
lens = []
hashed = set()
""" Find single word keywords """for doc in documents:    un_tokens = set()    tok_sents = [] for sent in doc:        cl_sent = remove_punct(sent.lower())        hash_sent = hash_string(cl_sent) # remove dupe sentences (not - will hurt df accuracy a little) if hash_sent in hashed: continue        hashed.add(hash_sent)
        tokens = tuple(cl_sent.split(" "))
        lens.append(len(tokens))
        sent_id += 1
        tok_sents.append((sent_id, tokens))
        sent_ids.add(sent_id)
        # create inverted index and unique tokens (for doc freq calc)
        proc_tokens = set()
        for tok in tokens:
            if tok not in proc_tokens:
                proc_tokens.add(tok)
                if tok not in un_tokens:
                    un_tokens.add(tok)
                    doc_freq[tok] += 1
    if len(tok_sents) > 0:
        tokenized_docs.append(tok_sents)
end = time.time()
print("Extracting Keywords from %i documents took %i secs" % (len(tokenized_docs), end - start))
# Get really frequent items for removal
num_docs = float(len(tokenized_docs))
above_threshold = [k for k, v in doc_freq.items() if v >= config.min_document_frequency]
# remove really frequent terms (those in config.max_proportion_documents or more of the documents, e.g. 50%)
too_freq = set([k for k in above_threshold if (doc_freq[k] / num_docs) >= config.max_proportion_documents])
freq_terms = [k for k in above_threshold
              if k not in stop_words and
                 k not in too_freq]
print("%s frequent terms identified for building phrases" % len(freq_terms))
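# Worked example (illustrative only - the config values here are assumed, not from the
# original): with 200 tokenized documents, min_document_frequency = 20 and
# max_proportion_documents = 0.5, a term survives into freq_terms only if it occurs in
# at least 20 but fewer than 100 documents and is not a stop word.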
""" Find Phrases """import time
start = time.time()
# Find all phrases up to length MAX_PHRASE_LEN at or above the defined MIN_DOC_FREQ above
phrase_doc_freq = defaultdict(int)
for term in freq_terms:
    phrase_doc_freq[tuple([term])] = doc_freq[term]
# data structure is a list of list (document) of pairs - sentences: (int, list (of tokens))
# each item is a doc, a list of sents. each sent is a list of valid remaining phrases
# seed with one valid phrase per sent
#working_docs = [map(lambda sent: [sent], d) for d in tokenized_docs]
working_docs = [map(lambda (sid, sent): (sid, [sent]), d) for d in tokenized_docs]
working_freq_terms = set(freq_terms)
# sentences with one or more phrases that are frequent enough (under the apriori algorithm closure principle)
# don't bother whittling this down further at the start, almost all sentences have at least one freq term in them
working_sent_ids = set(sent_ids)
""" Apriori-like Algorithm for Phrase Extraction """# use the downward closure principle from the apriori algorithm (https://en.wikipedia.org/wiki/Apriori_algorithm)#  combined with an inverted index to very efficiently extract common phrasesfor phrase_len in range(2, config.max_phrase_length + 1):    phrase_start = time.time() print "phrase_len", phrase_len print len(working_docs), "docs", len(working_freq_terms), "terms", len(working_sent_ids), "sentences" # for current phrase_len    current_phrase_doc_freq = defaultdict(int)
    # used to look up sentence ids by phrase
    phrase2sentids = defaultdict(set)
    new_work_docs = []
    for doc in working_docs:
        new_work_sents = []
        unique_potential_phrases = set()
        for sent_id, phrases in doc:
            if sent_id not in working_sent_ids:
                continue
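# Purely as an illustration (and not the author's implementation), here is a minimal,
# self-contained sketch of the downward closure idea the loop above relies on: a phrase
# of length n is only worth counting if both of its length n-1 sub-phrases were
# themselves frequent in the previous pass, which is what lets the working sets shrink
# on every iteration. For simplicity each sentence is treated as its own "document".

def grow_phrases_sketch(sentences, min_doc_freq, max_phrase_len):
    from collections import defaultdict
    # sentence-level "document" frequency of unigrams
    unigram_freq = defaultdict(int)
    for sent in sentences:
        for tok in set(sent):
            unigram_freq[(tok,)] += 1
    surviving = {p: c for p, c in unigram_freq.items() if c >= min_doc_freq}
    for n in range(2, max_phrase_len + 1):
        counts = defaultdict(int)
        for sent in sentences:
            seen = set()
            for i in range(len(sent) - n + 1):
                cand = tuple(sent[i:i + n])
                # downward closure: both (n-1)-length sub-phrases must already be frequent
                if cand[:-1] in surviving and cand[1:] in surviving and cand not in seen:
                    counts[cand] += 1
                    seen.add(cand)
        new_phrases = {p: c for p, c in counts.items() if c >= min_doc_freq}
        if not new_phrases:
            break  # nothing of length n survived, so nothing of length n+1 can either
        surviving.update(new_phrases)
    return surviving

#   grow_phrases_sketch([("machine", "learning", "rocks"),
#                        ("machine", "learning", "is", "fun")],
#                       min_doc_freq=2, max_phrase_len=3)
#   => {("machine",): 2, ("learning",): 2, ("machine", "learning"): 2}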
Example 2
import os, re, time
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from Utils.string_utils import clean_str, strip_non_ascii
from Utils.file_utils import find_files, delete_files
import ntpath
__REPL__ = ".\n"
# Make common html tags line breaks
def pre_process_text(txt):
    txt = txt.replace("</li><li>", __REPL__).replace("<li>", __REPL__).replace("</li>", __REPL__)
    txt = txt.replace("<br>", __REPL__)
    txt = txt.replace("<br/>", __REPL__)
    txt = txt.replace("<br />", __REPL__)
    txt = txt.replace("<p>", __REPL__)
    txt = txt.replace("<p/>", __REPL__)
    txt = txt.replace("<p />", __REPL__)
    txt = txt.replace("</p>", __REPL__)
    txt = txt.replace(". .", __REPL__)
    txt = txt.replace("&nbsp;", " ")
    # collapse runs of periods and spaces introduced by the replacements above
    while ".." in txt:
        txt = txt.replace("..", ". ")
    while "  " in txt:
        txt = txt.replace("  ", " ")
    return txt
def visible(element):
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    elif re.match('<!--.*-->', strip_non_ascii(element)):
        return False
    return True
def get_text(html):
    bs = BeautifulSoup(html)
    texts = bs.findAll(text=True)
    visible_texts = filter(visible, texts)
    return __REPL__.join(visible_texts)
def parse_html(html):
    txt = get_text(pre_process_text(html))
    return txt
def split_into_sentences(txt):
    txt = strip_non_ascii(txt)
    sents = map(clean_str, sent_tokenize(txt))
    return filter(lambda s: len(s.strip()) > 5, sents)
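# Illustrative usage (not part of the original listing), assuming the project's
# Utils.string_utils helpers behave as their names suggest (clean_str normalises a
# sentence, strip_non_ascii drops non-ascii characters):
#
#   html = "<div><p>First sentence.</p><br/>And a second one &nbsp; here.</div>"
#   split_into_sentences(parse_html(html))
#   # => roughly ["First sentence.", "And a second one here."] - only the visible
#   #    text is kept, with the html tags treated as sentence breaks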
ntpath.basename("a/b/c")def get_file_name(path):    head, tail = ntpath.split(path) return tail or ntpath.basename(head)
""" Process Files """import sysfrom Config.pre_process_config import PreProcessConfig
if len(sys.argv) != 2:
    raise Exception("Incorrect number of arguments passed - one expected, the config file name")
# sys.argv[0] is this script file, sys.argv[1] should be the config file
config = PreProcessConfig(sys.argv[1])
start = time.time()
if config.empty_processed_documents_folder:
    delete_files(config.processed_documents_folder, config.file_mask)
files = find_files(config.documents_folder, config.file_mask, True)
for i, fpath in enumerate(files):
    with open(fpath) as f:
        contents = f.read()
        if len(contents) < config.minimum_file_size_chars:
            continue

        if config.parse_html:
            contents = parse_html(contents)
            if len(contents) < config.minimum_file_size_chars:
                continue
        sents = split_into_sentences(contents)
        doc = "\n".join(sents)
        file_name = get_file_name(fpath)
        fout_name = config.processed_documents_folder + "/" + file_name.split(".")[0] + "_proc.txt"
        with open(fout_name, "w+") as fout:
            fout.write(doc)

    if i % 1000 == 0 and i > 0:
        print("%i documents processed" % i)

end = time.time()
print("Loading and processing documents took %s seconds" % str(end - start))
Example 3
 def test_001_default_pattern_relpath(self):
     self.assertEqual(
         find_files(dj('..', 'features', 'a2')),
         set([ap('a2', 'c', 'c1.feature'), ap('a2', 'f1')])
     )
Example 4
 def test_002_excludes_unix(self):
     self.assertEqual(
         find_files(_dir, '[ac]1.feature', ['a/c']),
         set([ap('a2', 'c', 'c1.feature'), ap('a', 'a1.feature')])
     )
Example 5
 def test_000_single_file(self):
     self.assertEqual(
         find_files(dj('f1.feature')),
         set([ap('f1.feature')])
     )