def markContent(text, stopwords_pattern):
    desc_0 = text.lower()
    desc_1 = PD.expandContractions(desc_0)
    desc_2 = PD.separateAttachedWords(desc_1)
    # isolated hyphens transformation step
    desc_3 = re.compile(r'(\s-\s)').sub(repl=" ; ", string=desc_2)
    docText = stopwords_pattern.sub(repl=" STOPWORD ", string=desc_3)
    desc_ls = nltk.tokenize.word_tokenize(docText)
    desc_ls_1 = list(map(PD.removeStartingApostrophe, desc_ls))
    desc_ls_2 = PD.joinSeparatedHyphens(desc_ls_1)
    coded_ls = list(map(markAsDelimiter, desc_ls_2))
    return coded_ls

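# Illustrative sketch (not part of the pipeline above): how the isolated-hyphen substitution
# and the STOPWORD marker behave on a toy string. The stopword pattern here is a hypothetical
# stand-in for the one returned by PD.getStopwordsPattern / PPD.getStopwordsPattern.
def _demo_mark_content_steps():
    import re
    toy_stopwords = re.compile(r'\b(the|a|of)\b')                   # hypothetical stand-in pattern
    text = "the cover of the phone - black"
    step_hyphens = re.compile(r'(\s-\s)').sub(" ; ", text)          # -> "the cover of the phone ; black"
    step_marked = toy_stopwords.sub(" STOPWORD ", step_hyphens)     # stopwords replaced by the literal token STOPWORD
    return step_marked
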
def process_all_mdinfo(prods_in_df, outfilepath, phrases_model, d2v_model):
    MyUtils.init_logging("ExtractMetadataInfo.log")
    f = open(outfilepath, "w")
    f.close()  # clean the output file between runs
    sw_pattern = PPD.getStopwordsPattern(includePunctuation=True)
    logging.info("Started postprocessing other metadata info")

    segment_nrows = 5 * 10**4
    logging.info("Number of elements in a segment: %s", str(segment_nrows))

    with open(outfilepath, "a") as out_file:
        out_file.write("_id_description_price_titlevec_mdcategories\n")
        for input_segment in pd.read_csv(prods_in_df, chunksize=segment_nrows, sep="_"):
            chunk_start = time()
            mdinfo_lts = []
            for prod_tupl in input_segment.itertuples():
                prodinfo_tuple = process_prodinfo(prod_tupl, phrases_model, d2v_model, sw_pattern)
                mdinfo_lts.append(prodinfo_tuple)
            pd.DataFrame(mdinfo_lts).to_csv(out_file, mode="a", header=False, sep="_")
            chunk_end = time()
            logging.info("Processing: other metadata info. Segment completed in %s seconds",
                         str(round(chunk_end - chunk_start, 3)))
    logging.info("Completed: processing product metadata.")

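# Minimal sketch of the chunked read / append-to-CSV pattern used above, with only pandas.
# The file names, column names and per-row transformation are hypothetical; the point is that
# the header is written exactly once and each chunk is appended without its own header.
def _demo_chunked_csv_processing(in_path="input.csv", out_path="output.csv", chunk_rows=1000):
    import pandas as pd
    with open(out_path, "w") as out_file:
        out_file.write("id,value\n")                                 # header written exactly once
    with open(out_path, "a") as out_file:
        for segment in pd.read_csv(in_path, chunksize=chunk_rows):
            rows = [(t.id, str(t.value).lower()) for t in segment.itertuples()]  # stand-in for the real row processing
            pd.DataFrame(rows).to_csv(out_file, mode="a", header=False, index=False)
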
def createQuestionDocuments():
    MyUtils.init_logging("PreprocessQuestions.log")
    ds = open(F.QADOCS_RAW, 'w')
    ds.close()  # cleaning the file between runs
    start_creatingInput = time.time()

    # Method: itertuples + to_csv. Objective: preprocess text and create TaggedDocuments
    sw_pattern = PPD.getStopwordsPattern(includePunctuation=False)
    punct_pattern = re.compile(r'([!"#$%&()*+,./:;<=>?@\[\\\]^_`{|}-~\'])|([--])')
    chunk_length = int(0.5 * (10**5))  # chunksize must be an integer

    with open(F.QADOCS_RAW, "a") as qadocs_file:
        qadocs_file.write(",words,tags\n")
        for input_segment in pd.read_csv(RQ.QA_TRAIN_DFPATH, chunksize=chunk_length, sep="_"):
            chunk_0 = map(lambda tupl: createDocForRow(tupl, sw_pattern, punct_pattern),
                          input_segment.itertuples())
            chunk_1 = list(filter(lambda x: x is not None, chunk_0))
            # debugging: size of the chunk in kilobytes; it also works as a progress update
            print(getsizeof(chunk_1) // (2**10))
            pd.DataFrame(chunk_1).to_csv(path_or_buf=qadocs_file, mode="a", header=False)
            logging.info("Chunk of documents created...")

    end_creatingInput = time.time()
    logging.info("Time spent creating the Documents: %s",
                 str(round(end_creatingInput - start_creatingInput, 3)))

def test_my_rake():
    the_stopwords_pattern = PD.getStopwordsPattern(includePunctuation=True)
    md_df = RM.load_md(RM.READKEYWORD_TRAINSUBSET)
    elem = MyUtils.pickRandomElement(md_df)
    # a null value may be nan (for prods) or '' (for quests)
    while elem.description == "nan" or len(elem.description) == 0:
        elem = MyUtils.pickRandomElement(md_df)
    apply_my_rake(elem.description, the_stopwords_pattern)

def createDocForTitle(title_text, stopwords_pattern):
    # Method 2: preprocess and then use word_tokenize, that implicitly calls the sentence tokenizer
    title_0 = title_text.lower()
    title_1 = PPD.expandContractions(title_0)
    title_2 = stopwords_pattern.sub(repl=" ", string=title_1)  # stopwords removal step
    titleWords_1 = nltk.tokenize.word_tokenize(title_2)
    row_doc = TaggedDocument(words=titleWords_1, tags=[0])  # (id not used)
    return row_doc

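# Minimal illustration (assumes gensim is installed): a TaggedDocument simply pairs a token list
# with one or more tags. Titles above use the dummy tag 0, while question documents use an
# "asin@unixTime" identifier; the sample words and tag here are made up.
def _demo_tagged_document():
    from gensim.models.doc2vec import TaggedDocument
    doc = TaggedDocument(words=["leather", "case", "tablet"], tags=["B000EXAMPLE@1390000000"])
    return doc.words, doc.tags
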
def create_the_models():
    MyUtils.init_logging("Encode_Common.log")
    PPD.createDescriptionDocuments()  # creates and saves version 1.0 of the docs, before phrases
    PPQ.createQuestionDocuments()
    collect()  # manual garbage collection
    create_phrases_model()
    prepare_dq_documents()  # v.1.1 of the docs, after phrases
    collect()  # manual garbage collection
    VD.create_docvectors_model()  # the Doc2Vec model, with the vectors of the training subset
    collect()

    d2v_model = VD.load_model()
    logging.info("d2v_model, memory size in MBs = %s", str(mem.asizeof(d2v_model) // 2**20))
    phrases_model = phrases.Phrases.load(F.PHRASES_MODEL)
    logging.info("phrases_model, memory size in MBs = %s", str(mem.asizeof(phrases_model) // 2**20))
    logging.info("Doc2Vec and Phrases models loaded.")
    return (d2v_model, phrases_model)

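# Usage sketch for the returned (d2v_model, phrases_model) pair; the token list is illustrative
# and the merged bigrams depend on what the Phrases model actually learned. Phrases rewrites
# frequent word pairs into single tokens, and Doc2Vec can then infer a vector for the result.
def _demo_use_models(d2v_model, phrases_model):
    tokens = ["screen", "protector", "tempered", "glass"]
    phrased_tokens = phrases_model[tokens]               # e.g. ["screen_protector", "tempered_glass"] if those bigrams were learned
    doc_vector = d2v_model.infer_vector(phrased_tokens)  # numpy array holding the inferred document embedding
    return doc_vector
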
def createDocForRow(row, stopwords_pattern, punct_pattern):
    row_asin = row.asin
    row_unixtime = row.unixTime
    row_id = row_asin + "@" + str(row_unixtime)
    row_desc = str(row.question)
    if row_desc != 'nan' and len(row_desc) > 0:
        # Method 2: preprocess and then use word_tokenize, that implicitly calls the sentence tokenizer
        row_desc_0 = row_desc.lower()
        row_desc_1 = PPD.expandContractions(row_desc_0)
        row_desc_2 = PPD.separateAttachedWords(row_desc_1)
        # isolated hyphens transformation step
        row_desc_3 = re.compile(r'(\s-\s)').sub(repl=" ; ", string=row_desc_2)
        docText = stopwords_pattern.sub(repl=" ", string=row_desc_3)  # stopwords removal step
        docWords_1 = nltk.tokenize.word_tokenize(docText)
        docWords_2 = list(map(PPD.removeStartingApostrophe, docWords_1))
        docWords_3 = PPD.joinSeparatedHyphens(docWords_2)
        # drop any punctuation signs that were not filtered out earlier
        docWords_4 = list(filter(lambda w: not punct_pattern.match(w), docWords_3))
        row_doc = D2V.TaggedDocument(words=docWords_4, tags=[row_id])
        return row_doc
    return None  # empty or missing question text: the caller filters out None documents

def my_rake_exe(in_df_filepath, elementTextAttribute, threshold_fraction, out_kwsdf_filepath):
    MyUtils.init_logging(logfilename="MyRAKE.log")
    logging.info("Keyword extraction started.")

    sw_pattern = PD.getStopwordsPattern(includePunctuation=True)
    numbers_pattern = re.compile(r'([0-9])+')  # numbers are not keywords
    allsw_expression = "|".join([sw_pattern.pattern, numbers_pattern.pattern])
    allsw_pattern = re.compile(allsw_expression)

    f = open(out_kwsdf_filepath, "w")
    f.close()  # clean the output file between runs

    segment_nrows = int(1.0 * 10**4)
    current_segment = 1
    logging.info("Number of elements in a segment: %s", str(segment_nrows))

    with open(out_kwsdf_filepath, "a") as outfile:
        outfile.write(",id,keywords\n")
        with open(in_df_filepath, "r") as in_df_file:
            for input_segment in pd.read_csv(in_df_file, chunksize=segment_nrows, sep="_",
                                             engine='c', error_bad_lines=False):
                executor = pathos.pools.ProcessPool(max(1, multiprocessing.cpu_count() - 1))
                if len(input_segment) < segment_nrows:
                    logging.warning(
                        "Segment with length %s < %s ;\n"
                        "either lines with unreadable characters were dropped, "
                        "or this is the last chunk", len(input_segment), segment_nrows)
                seg_start = time()
                args = [(MyUtils.refine_tuple(element), elementTextAttribute, allsw_pattern,
                         input_segment.columns, threshold_fraction)
                        for element in input_segment.itertuples()]
                seg_lts_map = executor.map(map_applymyrake, args)
                seg_lts = list(filter(lambda x: x is not None, seg_lts_map))
                pd.DataFrame(seg_lts).to_csv(outfile, mode="a", header=False)
                seg_end = time()
                logging.info(
                    "* Keyword extraction ; segment n. %s of the input dataframe processed in %s seconds",
                    current_segment, str(round(seg_end - seg_start, 3)))
                executor.terminate()
                executor.restart()
                collect()
                current_segment = current_segment + 1
    logging.info("Keyword extraction : finished.")

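# Minimal sketch (assumes pathos is installed) of the pool-per-segment pattern used above.
# The worker function and its arguments are made up; the real code maps map_applymyrake over
# per-element argument tuples and filters out the None results.
def _demo_keywords_worker(arg):
    element_id, text = arg
    return (element_id, text.split()[:3])              # stand-in for the actual RAKE keyword scoring

def _demo_pool_map():
    import multiprocessing
    import pathos.pools
    pool = pathos.pools.ProcessPool(max(1, multiprocessing.cpu_count() - 1))
    args = [("id1", "first sample text"), ("id2", "second sample text")]
    results = pool.map(_demo_keywords_worker, args)
    pool.terminate()                                   # released after each segment in my_rake_exe, then restarted
    return results
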