import sys
import time

from Config.pre_process_config import PreProcessConfig

# NOTE(review): this fragment used `sys` and `time` without importing them;
# both stdlib imports were added above. delete_files / find_files /
# parse_html / split_into_sentences / get_file_name are presumably project
# helpers defined elsewhere - confirm.

if len(sys.argv) != 2:
    raise Exception(
        "Incorrect number of arguments passed - one expected, the config file name"
    )

# sys.argv[0] is this script file, sys.argv[1] should be the config file
config = PreProcessConfig(sys.argv[1])

start = time.time()

# Optionally wipe previously processed output before regenerating it.
if config.empty_processed_documents_folder:
    delete_files(config.processed_documents_folder, config.file_mask)

files = find_files(config.documents_folder, config.file_mask, True)
for i, fpath in enumerate(files):
    with open(fpath) as f:
        contents = f.read()
        # Skip documents too short to be worth processing.
        if len(contents) < config.minimum_file_size_chars:
            continue
        if config.parse_html:
            contents = parse_html(contents)
            # Re-check size after the markup has been stripped.
            if len(contents) < config.minimum_file_size_chars:
                continue

        # Processed output holds one sentence per line.
        sents = split_into_sentences(contents)
        doc = "\n".join(sents)

        file_name = get_file_name(fpath)
        # NOTE(review): the original line was truncated here mid-expression;
        # completed to match the duplicate of this script later in the file:
        # "<original stem>_proc.txt" inside the processed-documents folder.
        fout_name = config.processed_documents_folder + "/" + file_name.split(".")[0] + "_proc.txt"
# Expect exactly one CLI argument: the path to the config file.
if len(sys.argv) != 2:
    raise Exception("Incorrect number of arguments passed - one expected, the config file name")

# sys.argv[0] is this script file, sys.argv[1] should be the config file
config = ExtractKeywordsConfig(sys.argv[1])
script_start = time.time()

# Load the stop-word list when one is configured; otherwise run with none.
stop_words = set()
if config.stop_words_file:
    stop_words = load_stop_words(config.stop_words_file)
    print("%i stop words loaded" % len(stop_words))

""" Load Documents """
start = time.time()
files = find_files(config.processed_documents_folder, config.file_mask, True)
print("%s files found in %s" % (len(files), config.processed_documents_folder))

# Each document becomes the list of its lines (one sentence per line).
documents = []
for i, fname in enumerate(files):
    with open(fname) as f:
        contents = f.read()
    documents.append(contents.split("\n"))
end = time.time()
print("Loading %i documents took %s seconds" % (len(files), str(end - start)))

""" Extract Common Terms and Phrases """
start = time.time()
# Or use a counter here.
doc_freq = defaultdict(int)

# remove short docs
# (removed scraped-page residue: "Ejemplo n.º 3" / "0" - these lines were
#  extraction artifacts from the example-listing site, not Python code)
#  - TODO: use functional composition to speed up
is_a_synonym_filter = fact_is_synonym_filter(syn_mapper)

# Token-level transforms applied to every sentence, in this order.
analysis_chain = [
    clean_str,
    white_space_tokenize,
    remove_punct_at_end_filter,
    lower_case_filter,
    stop_filter,
    syn_mapper.map_synonyms,
    remove_empty_tokens_filter,
]
# is_a_synonym_filter] - Un-comment to just train on keywords.

# Test
#rslt = debug_analyze("$150k as400 Sr.\ Java/j2ee and the C#.! developer. FIT \"HOT\" dev. -IBM's business, sql server management", analysis_chain)

""" Load Documents """
start = time.time()

sentences = []
files = find_files(config.processed_documents_folder, config.file_mask, True)
print("%s files found in %s" % (len(files), config.processed_documents_folder))

# Flatten every processed file into one shared list of sentences
# (processed documents hold one sentence per line).
documents = []
for i, fname in enumerate(files):
    with open(fname) as f:
        contents = f.read()
    sentences.extend(contents.split("\n"))
end = time.time()
print("Loading %i sentences took %s seconds" %
      (len(sentences), str(end - start)))

""" Analyze - clean, tokenize, extract phrases """
print("%i sentences to process" % len(sentences))

tokenized = []
print("Tokenizing sentences")
""" Process Files """
import sys
from Config.pre_process_config import PreProcessConfig

if len(sys.argv) != 2:
    raise Exception("Incorrect number of arguments passed - one expected, the config file name")

#sys.argv[0] is this script file, sys.argv[1] should be the config file
config = PreProcessConfig(sys.argv[1])

start = time.time()

if config.empty_processed_documents_folder:
    delete_files(config.processed_documents_folder, config.file_mask)

files = find_files(config.documents_folder, config.file_mask, True)
for i, fpath in enumerate(files):
    with open(fpath) as f:
        contents = f.read()
        if len(contents) < config.minimum_file_size_chars:
            continue
        if config.parse_html:
            contents = parse_html(contents)
            if len(contents) < config.minimum_file_size_chars:
                continue

        sents = split_into_sentences(contents)
        doc = "\n".join(sents)

        file_name = get_file_name(fpath)
        fout_name = config.processed_documents_folder + "/" + file_name.split(".")[0] + "_proc.txt"