Example #1
def markContent(text, stopwords_pattern):
    desc_0 = text.lower()
    desc_1 = PD.expandContractions(desc_0)
    desc_2 = PD.separateAttachedWords(desc_1)

    # isolated hyphens transformation step: turn " - " into a semicolon delimiter
    desc_3 = re.compile(r'(\s-\s)').sub(repl=" ; ", string=desc_2)
    # replace stopwords with the STOPWORD marker token
    docText = stopwords_pattern.sub(repl=" STOPWORD ", string=desc_3)
    desc_ls = nltk.tokenize.word_tokenize(docText)

    desc_ls_1 = list(map(PD.removeStartingApostrophe, desc_ls))
    desc_ls_2 = PD.joinSeparatedHyphens(desc_ls_1)
    coded_ls = list(map(markAsDelimiter, desc_ls_2))

    return coded_ls
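
A minimal usage sketch (not part of the original code): the sample string is invented, and PD.getStopwordsPattern is assumed to build the compiled stopwords regex the same way as in the later examples.

sw_pattern = PD.getStopwordsPattern(includePunctuation=True)  # assumed helper, as used in test_my_rake below
coded = markContent("This isn't a well-known brand - but it works fine.", sw_pattern)
print(coded)  # tokens, with stopwords replaced by the STOPWORD marker and delimiters marked via markAsDelimiter
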
def process_all_mdinfo(prods_in_df, outfilepath, phrases_model, d2v_model):
    MyUtils.init_logging("ExtractMetadataInfo.log")

    f = open(outfilepath, "w")
    f.close()  #clean between runs
    sw_pattern = PPD.getStopwordsPattern(includePunctuation=True)
    logging.info("Started postprocessing other metadata info")

    segment_nrows = 5 * 10**4
    logging.info("Number of elements in a segment: %s", str(segment_nrows))
    with open(outfilepath, "a") as out_file:
        out_file.write("_id_description_price_titlevec_mdcategories\n")
        for input_segment in pd.read_csv(prods_in_df,
                                         chunksize=segment_nrows,
                                         sep="_"):
            chunk_start = time()
            mdinfo_lts = []
            for prod_tupl in input_segment.itertuples():
                prodinfo_tuple = process_prodinfo(prod_tupl, phrases_model,
                                                  d2v_model, sw_pattern)
                mdinfo_lts.append(prodinfo_tuple)
            pd.DataFrame(mdinfo_lts).to_csv(out_file,
                                            mode="a",
                                            header=False,
                                            sep="_")
            chunk_end = time()
            logging.info(
                "Processing other metadata info: segment completed in %s seconds",
                str(round(chunk_end - chunk_start, 3)))
    logging.info("Completed: processing product metadata.")
def createQuestionDocuments():
    MyUtils.init_logging("PreprocessQuestions.log")
    ds = open(F.QADOCS_RAW, 'w')  #cleaning the file between runs
    ds.close()
    start_creatingInput = time.time()

    # Method: itertuples + pickle. Objective: preprocess text and create TaggedDocuments
    sw_pattern = PPD.getStopwordsPattern(includePunctuation=False)
    # residual punctuation tokens to drop after stopword removal
    punct_pattern = re.compile(
        r'([!"#$%&()*+,./:;<=>?@\[\\\]^_`{|}-~\'])|([--])')
    chunk_length = 5 * 10**4  # rows per chunk; kept as an int, since read_csv expects an integer chunksize
    with open(F.QADOCS_RAW, "a") as qadocs_file:
        qadocs_file.write(",words,tags\n")
        for input_segment in pd.read_csv(RQ.QA_TRAIN_DFPATH,
                                         chunksize=chunk_length,
                                         sep="_"):
            chunk_0 = map(
                lambda tupl: createDocForRow(tupl, sw_pattern, punct_pattern),
                input_segment.itertuples())
            chunk_1 = list(filter(lambda x: x is not None, chunk_0))
            # debugging: print the (approximate) chunk size in kilobytes; also serves as a progress update
            print(getsizeof(chunk_1) // (2**10))
            pd.DataFrame(chunk_1).to_csv(path_or_buf=qadocs_file,
                                         mode="a",
                                         header=False)
            logging.info("Chunk of documents created...")

    end_creatingInput = time.time()
    logging.info("Time spent creating the Documents: %s",
                 str(round(end_creatingInput - start_creatingInput, 3)))
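
A sketch (not in the original code) of reading the saved question documents back: the column names follow the ",words,tags" header written above, and ast.literal_eval is used on the assumption that to_csv stored the token and tag lists as their string representations.

import ast
import pandas as pd

qadocs_df = pd.read_csv(F.QADOCS_RAW, index_col=0)
words_lists = qadocs_df["words"].map(ast.literal_eval)  # back from string to list of tokens
tags_lists = qadocs_df["tags"].map(ast.literal_eval)    # back from string to list with the "asin@unixTime" id
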
Example #4
def test_my_rake():
    the_stopwords_pattern = PD.getStopwordsPattern(includePunctuation=True)
    md_df = RM.load_md(RM.READKEYWORD_TRAINSUBSET)
    elem = MyUtils.pickRandomElement(md_df)
    # a null value may be "nan" (for products) or '' (for questions)
    while elem.description == "nan" or len(elem.description) == 0:
        elem = MyUtils.pickRandomElement(md_df)
    apply_my_rake(elem.description, the_stopwords_pattern)
def createDocForTitle(title_text, stopwords_pattern):
    # Method 2: preprocess, then use word_tokenize, which implicitly calls the sentence tokenizer
    title_0 = title_text.lower()
    title_1 = PPD.expandContractions(title_0)

    title_2 = stopwords_pattern.sub(repl=" ", string=title_1)  # stopwords removal step

    titleWords_1 = nltk.tokenize.word_tokenize(title_2)

    row_doc = TaggedDocument(words=titleWords_1, tags=[0])  #(id not used)

    return row_doc
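
A small usage sketch: the title string is invented, and the stopwords pattern is built the same way as in createQuestionDocuments above.

sw_pattern = PPD.getStopwordsPattern(includePunctuation=False)
title_doc = createDocForTitle("Stainless Steel Travel Mug, 16 oz", sw_pattern)
print(title_doc.words, title_doc.tags)  # tokenized, stopword-free title, tagged with the placeholder id 0
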
def create_the_models():
    MyUtils.init_logging("Encode_Common.log")
    PPD.createDescriptionDocuments()  # creates and saves version 1.0 of the docs, before phrases
    PPQ.createQuestionDocuments()
    collect()  # manual garbage collection

    create_phrases_model()
    prepare_dq_documents()  # v.1.1 of the docs, after phrases

    collect()  # manual garbage collection
    VD.create_docvectors_model()  # the Doc2Vec model, with the vectors of the training subset
    collect()

    d2v_model = VD.load_model()
    logging.info("d2v_model, memory size in MBs = %s",
                 str(mem.asizeof(d2v_model) // 2**20))
    phrases_model = phrases.Phrases.load(F.PHRASES_MODEL)
    logging.info("phrases_model, memory size in MBs = %s",
                 str(mem.asizeof(phrases_model) // 2**20))

    logging.info("Doc2Vec and Phrases models loaded.")
    return (d2v_model, phrases_model)
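
A sketch of one downstream use of the returned models (the token list is invented): the Phrases model merges learned collocations, and gensim's Doc2Vec.infer_vector produces a vector for the new text.

d2v_model, phrases_model = create_the_models()
tokens = ["stainless", "steel", "travel", "mug"]
tokens_with_phrases = phrases_model[tokens]        # apply the learned collocations
vector = d2v_model.infer_vector(tokens_with_phrases)
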
def createDocForRow(row, stopwords_pattern, punct_pattern):
    row_asin = row.asin
    row_unixtime = row.unixTime
    row_id = row_asin + "@" + str(row_unixtime)
    row_desc = str(row.question)

    if row_desc != 'nan' and len(row_desc) > 0:

        # Method 2: preprocess, then use word_tokenize, which implicitly calls the sentence tokenizer
        row_desc_0 = row_desc.lower()
        row_desc_1 = PPD.expandContractions(row_desc_0)
        row_desc_2 = PPD.separateAttachedWords(row_desc_1)

        # isolated hyphens transformation step: turn " - " into a semicolon delimiter
        row_desc_3 = re.compile(r'(\s-\s)').sub(repl=" ; ", string=row_desc_2)
        # stopwords removal step
        docText = stopwords_pattern.sub(repl=" ", string=row_desc_3)

        docWords_1 = nltk.tokenize.word_tokenize(docText)
        docWords_2 = list(map(PPD.removeStartingApostrophe, docWords_1))
        docWords_3 = PPD.joinSeparatedHyphens(docWords_2)

        # drop any punctuation tokens that were not filtered out earlier
        docWords_4 = [w for w in docWords_3 if not punct_pattern.match(w)]

        row_doc = D2V.TaggedDocument(words=docWords_4, tags=[row_id])

        return row_doc
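
A sketch of applying createDocForRow outside of createQuestionDocuments, on the first chunk of the '_'-separated Q&A dataframe; the chunk size and the simplified punctuation pattern are illustrative choices, and the columns asin, unixTime and question are assumed from the attribute accesses above.

sw_pattern = PPD.getStopwordsPattern(includePunctuation=False)
punct_pattern = re.compile(r'[!"#$%&\'()*+,./:;<=>?@\[\\\]^_`{|}~-]')  # simplified pattern, for illustration
qa_chunk = next(pd.read_csv(RQ.QA_TRAIN_DFPATH, chunksize=1000, sep="_"))
docs = [createDocForRow(row, sw_pattern, punct_pattern) for row in qa_chunk.itertuples()]
docs = [d for d in docs if d is not None]  # rows with an empty or 'nan' question return None
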
Example #8
def my_rake_exe(in_df_filepath, elementTextAttribute, threshold_fraction,
                out_kwsdf_filepath):
    MyUtils.init_logging(logfilename="MyRAKE.log")
    logging.info("Keyword extraction started.")
    sw_pattern = PD.getStopwordsPattern(includePunctuation=True)
    numbers_pattern = re.compile(r'([0-9])+')  #numbers are not keywords
    allsw_expression = "|".join([sw_pattern.pattern, numbers_pattern.pattern])
    allsw_pattern = re.compile(allsw_expression)
    f = open(out_kwsdf_filepath, "w")
    f.close()  #clean between runs

    segment_nrows = int(1.0 * 10**4)
    current_segment = 1
    logging.info("Number of elements in a segment: " + str(segment_nrows))

    with open(out_kwsdf_filepath, "a") as outfile:
        outfile.write(",id,keywords\n")
        with open(in_df_filepath, "r") as in_df_file:
            #logging.info("current subfile being processed: %s", traindf_filepath)
            for input_segment in pd.read_csv(in_df_file,
                                             chunksize=segment_nrows,
                                             sep="_",
                                             engine='c',
                                             error_bad_lines=False):  # on_bad_lines="skip" in pandas >= 1.3
                executor = pathos.pools.ProcessPool(
                    max(1, multiprocessing.cpu_count() - 1))
                if len(input_segment) < segment_nrows:
                    logging.warning(
                        "Segment with length %s < %s; either lines with unreadable "
                        "characters were dropped, or this is the last chunk",
                        len(input_segment), segment_nrows)
                seg_start = time()
                args = [
                    (MyUtils.refine_tuple(element), elementTextAttribute,
                     allsw_pattern, input_segment.columns, threshold_fraction)
                    for element in input_segment.itertuples()
                ]
                #logging.info("The arguments for the current segment have been created. Length: %s", len(args))
                seg_lts_map = executor.map(map_applymyrake,
                                           args)  #executor.map
                #logging.info("Mapping operation completed: keywords created; proceeding to filter intermediate list...")
                seg_lts = list(filter(lambda x: x is not None, seg_lts_map))
                # for prod_tuple in input_segment.itertuples():
                #     logging.info(prod_tuple.asin)
                #     id_kws_tuple = apply_rake_to_element(prod_tuple, elementTextAttribute, allsw_pattern,
                #                                            input_segment.columns,threshold_fraction)
                #     if id_kws_tuple is not None:
                #         seg_lts.append(id_kws_tuple)
                #logging.info("List filtered; proceeding to save keywords to file...")
                pd.DataFrame(seg_lts).to_csv(outfile, mode="a", header=False)
                seg_end = time()
                logging.info(
                    "Keyword extraction: segment %s of the input dataframe processed in %s seconds",
                    current_segment, str(round(seg_end - seg_start, 3)))
                executor.terminate()
                executor.restart()
                collect()
                current_segment = current_segment + 1
    logging.info("Keyword extraction : finished.")