def compute_term_features(doc_list, global_term_dict, df_raw, store_filename):
    import math, tabulate, ioData

    # reindex df_raw by link column
    df = df_raw.set_index('link')
    feature_extractor = utils.TextualFeatureExtractor(df)

    nr_docs = len(doc_list)
    # total number of relevant terms across all documents
    nr_terms = reduce(lambda x, y: x + len(y.relevant_terms), doc_list, 0)

    term_dataset = []

    for doc in doc_list:
        print "======== " + doc.url + " ========"
        meta_keywords = df.loc[doc.url]['keywords']

        # 1) compute TF
        # print "Computing TF ... "
        doc.compute_tf()
        # print "Document parsed: "
        # print doc.transformed

        # print "Computing DF, TFIDF and Textual Features ... "
        for term in doc.relevant_terms:
            # 2) compute DF and TFIDF - use global_term_dict
            term.df = global_term_dict[term]
            term.tfidf = term.tf * math.log(1 + float(nr_docs) / float(term.df), 2)
            # print tabulate.tabulate([[term, term.cvalue, term.tf, term.df, term.tfidf]],
            #                         headers=('term', 'cval', 'tf', 'df', 'tfidf'))

            # 3) compute linguistic features
            term.set_textual_feature_extractor(feature_extractor)
            term.extract_textual_features()
            # print tabulate.tabulate([[term, term.is_title, term.is_url, term.is_first_par, term.is_last_par,
            #                           term.is_description, term.is_img_caption, term.is_anchor, term.doc_position]],
            #                         headers=('term', 'is_title', 'is_url', 'is_first_par', 'is_last_par',
            #                                  'is_description', 'is_img_desc', 'is_anchor', 'doc_pos'))

            # 4) check if term is in meta keywords (i.e. is relevant)
            term.is_keyword = is_meta_keyword(term, meta_keywords)
            # print ":: IS RELEVANT = " + str(term.is_keyword)

            term_dataset.append([term.original, doc.url, term.cvalue, term.tf, term.df, term.tfidf,
                                 term.is_title, term.is_url, term.is_first_par, term.is_last_par,
                                 term.is_description, term.is_img_caption, term.is_anchor,
                                 term.doc_position, term.is_keyword])

    term_df_headers = ['term', 'doc_url', 'cvalue', 'tf', 'df', 'tfidf',
                       'is_title', 'is_url', 'is_first_par', 'is_last_par',
                       'is_description', 'is_img_desc', 'is_anchor', 'doc_pos', 'relevant']
    term_df = pd.DataFrame(term_dataset, columns=term_df_headers)

    ioData.writeData(term_df, store_filename)
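

# is_meta_keyword is defined elsewhere in this module and is only referenced above.
# The sketch below is a minimal illustration of the kind of check it could perform,
# assuming meta_keywords is a comma-separated string and matching on the lower-cased
# term; both the name and the matching rule are assumptions, not the project's
# actual implementation.
def _is_meta_keyword_sketch(term, meta_keywords):
    keywords = [k.strip() for k in meta_keywords.lower().split(',')]
    return str(term).lower() in keywords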
def create_term_train_test_dataset(global_term_feature_file, extracted_test_terms_file,
                                   train_feature_file, test_feature_file):
    import ioData as io

    df = io.readData(global_term_feature_file)

    cvalRes = None
    with open(extracted_test_terms_file) as fp:
        cvalRes = json.load(fp, encoding="utf-8")

    test_urls = cvalRes.keys()

    # test set: all term rows belonging to documents listed in the extracted test-term file
    test_df = df.loc[df['doc_url'].isin(test_urls)]
    io.writeData(test_df, test_feature_file)

    # train set: every remaining row
    train_df = df.loc[~df.index.isin(test_df.index)]
    io.writeData(train_df, train_feature_file)
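

# ioData is the project's own I/O helper module and is not shown in this section.
# The stand-ins below illustrate the contract the functions above rely on
# (DataFrame in, JSON file out, and back); assuming readData/writeData simply
# round-trip a DataFrame through pandas' JSON serialization is a guess for
# illustration, not the module's actual implementation.
def _read_df_json_sketch(filename):
    # hypothetical stand-in for ioData.readData
    return pd.read_json(filename)


def _write_df_json_sketch(df, filename):
    # hypothetical stand-in for ioData.writeData
    df.to_json(filename)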
def has_jwplayer(paragraphs):
    # returns True if any paragraph fragment contains a jwplayer( call
    for p in paragraphs:
        s = sum(map(lambda x: x.count("jwplayer("), p))
        if s > 0:
            return True

    return False


if __name__ == "__main__":
    import ioData

    TRAIN_DATASET_FILE = "dataset/term-feature-train-dataset-v3.json"
    TEST_DATASET_FILE = "dataset/term-feature-test-dataset-v3.json"

    # load the Grapeshot comparison keywords and normalize them to lower case
    grapeshot_df = pd.read_excel("dataset/meta_keywords_overlaps.xlsx", "Overlaps")
    grapeshot_df = grapeshot_df[["URL", "Keywords"]]
    grapeshot_df.columns = ['url', 'keywords']
    grapeshot_df['keywords'] = grapeshot_df['keywords'].map(lambda x: x.lower())

    extracted_df = extract_test_keywords(TRAIN_DATASET_FILE, TEST_DATASET_FILE, retrain=True)

    # merge dataframes and compute per-document keyword overlap
    comparison_df = pd.merge(extracted_df, grapeshot_df, on='url')
    comparison_df['overlap'] = comparison_df.apply(extracted_keyterms_overlap, axis=1)

    ioData.writeData(comparison_df, "dataset/comparison_df_v3.json")
    comparison_df.to_excel("dataset/comparison_df_v3.xlsx", "Overlap")

    print comparison_df.describe()
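

# extracted_keyterms_overlap is defined elsewhere in this module; the sketch below
# illustrates a row-wise overlap measure of the kind applied above. The column
# names ('extracted_terms', 'keywords') and the Jaccard-style score are assumptions
# for illustration, not the project's actual definition.
def _keyterms_overlap_sketch(row):
    extracted = set(t.strip().lower() for t in row['extracted_terms'].split(',') if t.strip())
    reference = set(k.strip().lower() for k in row['keywords'].split(',') if k.strip())
    if not extracted or not reference:
        return 0.0
    # fraction of keyterms shared between the extracted set and the reference set
    return float(len(extracted & reference)) / len(extracted | reference)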