from __future__ import print_function

import os
import pickle
import sys

import numpy as np

import query_cqadupstack as qcqa

# NOTE: DatasetContainer and vectorize are used below but defined elsewhere in
# this repository.

# The twelve CQADupStack subforums (assumed here; adjust to match your data).
list_of_subforums = [
    "android", "english", "gaming", "gis", "mathematica", "physics",
    "programmers", "stats", "tex", "unix", "webmasters", "wordpress",
]


def main():
    # TODO: need to agree on a place to put data.
    # subforum = "webmasters"
    for ele in list_of_subforums:
        data_directory = "../data/cqadupstack/" + ele + ".zip"
        o = qcqa.load_subforum(data_directory)
        # Do the train/test split. This writes output files and takes a minute...
        # TODO: can we make this write the files outside of the git repo?
        o.split_for_classification()
def extract_dataset(name, path_to_zip, output_path):
    print("Loading the %s subforum" % name)
    src = qcqa.load_subforum("%s/%s.zip" % (path_to_zip, name))
    current_path = os.getcwd()
    # Resolve a relative output path against the current working directory and
    # strip any trailing "/".
    if not output_path.startswith("/"):
        output_path = "%s/%s" % (current_path, output_path)
    output_path = output_path.rstrip("/")
    # Create the output directory and a per-subforum subdirectory.
    full_path = "%s/%s" % (output_path, name)
    for directory in (output_path, full_path):
        if not os.path.isdir(directory):
            try:
                os.mkdir(directory)
            except OSError:
                print("output path %s could not be created" % directory)
                sys.exit(1)
    # Pickle the loaded subforum so it can be reloaded without re-parsing the zip.
    with open("%s/%s_src.pkl" % (full_path, name), "wb") as output:
        pickle.dump(src, output)
    # split_for_classification() writes its files into the current working
    # directory, so chdir into the target directory first and back afterwards.
    try:
        os.chdir(full_path)
        print("Splitting %s for classification into output path %s"
              % (name, full_path))
        src.split_for_classification()
    except OSError:
        print("Could not access output path")
        sys.exit(1)
    os.chdir(current_path)
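# Hedged usage sketch for extract_dataset (the paths below are illustrative;
# it assumes the subforum zips live in ../data/cqadupstack):
#
#     extract_dataset("webmasters", "../data/cqadupstack", "out")
#
# This pickles the loaded subforum to out/webmasters/webmasters_src.pkl and
# writes the classification split files into out/webmasters/.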
def get_duplicate_pairs(subforum):
    data_directory = "../data/cqadupstack/" + subforum + ".zip"
    o = qcqa.load_subforum(data_directory)
    print("collecting duplicate pairs...")
    # Collect every duplicate pair exactly once by storing each pair as
    # (min_id, max_id).
    duplicate_pairs = set()
    posts_with_duplicates = o.get_posts_with_duplicates()
    for post in posts_with_duplicates:
        duplicates = o.get_duplicates(post)
        for dup_candidate in duplicates:
            min_post = unicode(str(min(int(post), int(dup_candidate))), "utf-8")
            max_post = unicode(str(max(int(post), int(dup_candidate))), "utf-8")
            duplicate_pairs.add((min_post, max_post))
    num_duplicate_pairs = len(duplicate_pairs)
    print("Total number of unique duplicate pairs is:", num_duplicate_pairs)
    return duplicate_pairs, num_duplicate_pairs
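# For example (hypothetical post IDs), get_duplicate_pairs("webmasters") could
# return something like:
#
#     ({(u"123", u"456"), (u"789", u"1024")}, 2)
#
# i.e. a set of (min_id, max_id) unicode tuples together with its size.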
def extract_test_data(subforum, testflag="small"):
    """Convert test data from a particular subforum into a DatasetContainer."""
    print("Starting extraction of test data...")
    data_directory = "../data/cqadupstack/" + subforum + ".zip"
    o = qcqa.load_subforum(data_directory)

    # Read the test pairs file: one "post1 post2 dup" triple per line.
    # split() (rather than split(" ")) also strips the trailing newline.
    unique_test_ID_set = set()
    test_dup = []
    with open(subforum + "_testpairs_" + testflag + ".txt") as f:
        for line in f:
            data = line.split()
            (post1, post2, dup) = (unicode(data[0], "utf-8"),
                                   unicode(data[1], "utf-8"),
                                   unicode(data[2], "utf-8"))
            unique_test_ID_set.add(post1)
            unique_test_ID_set.add(post2)
            test_dup.append([post1, post2, dup])
    unique_test_ID_list = list(unique_test_ID_set)

    # Cleaned text for each test post: body, title, and title+body combined,
    # parallel to unique_test_ID_list.
    test_post_body_list = [
        o.perform_cleaning(o.get_postbody(ID), remove_stopwords=True)
        for ID in unique_test_ID_list
    ]
    test_post_title_list = [
        o.perform_cleaning(o.get_posttitle(ID), remove_stopwords=True)
        for ID in unique_test_ID_list
    ]
    test_post_both_list = [
        o.perform_cleaning(o.get_post_title_and_body(ID), remove_stopwords=True)
        for ID in unique_test_ID_list
    ]
    # Author reputation (0 for posts without a known user) and post scores.
    test_reputation = {
        ID: (o.get_user_reputation(o.get_postuserid(ID))
             if o.get_postuserid(ID) != False else 0)
        for ID in unique_test_ID_list
    }
    test_scores = {ID: o.get_postscore(ID) for ID in unique_test_ID_list}

    test_collection = DatasetContainer(unique_test_ID_list, test_dup,
                                       test_post_body_list,
                                       test_post_title_list,
                                       test_post_both_list, None, None, None,
                                       test_reputation, test_scores)
    # Fill in the term-frequency matrices.
    test_tf_body, feature_names = vectorize(test_collection.bodies)
    test_tf_title, feature_names = vectorize(test_collection.titles)
    test_tf_both, feature_names = vectorize(test_collection.combined)
    test_collection.tf_body = test_tf_body
    test_collection.tf_title = test_tf_title
    test_collection.tf_combined = test_tf_both
    return test_collection
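# The <subforum>_testpairs_<testflag>.txt file read above is assumed to
# contain whitespace-separated "post1 post2 dup" triples, one pair per line,
# e.g. (hypothetical IDs):
#
#     123 456 1
#     789 1024 0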
def construct_train_valid(subforum):
    """Construct the train and validation sets.

    Keyword arguments:
    subforum -- the subforum to construct the datasets from;
                if None, use all subforums
    """
    # TODO: collect all subforums if subforum is None
    data_directory = "../data/cqadupstack/" + subforum + ".zip"
    o = qcqa.load_subforum(data_directory)
    print("starting speedy data generation...")

    # Step 1: get all unique duplicate pairs.
    duplicate_pairs, num_duplicate_pairs = get_duplicate_pairs(subforum)

    # Step 2: randomly sample pairs of posts that are not duplicates. The idea
    # is to have roughly a 1:1 ratio of duplicate to non-duplicate pairs.
    non_duplicate_pairs = set()
    while len(non_duplicate_pairs) < num_duplicate_pairs:
        (post1, post2, tag) = o.get_random_pair_of_posts()
        if tag == 'nondup':
            min_post = unicode(str(min(int(post1), int(post2))), "utf-8")
            max_post = unicode(str(max(int(post1), int(post2))), "utf-8")
            non_duplicate_pairs.add((min_post, max_post))
    print("Total number of non-duplicate pairs is:", len(non_duplicate_pairs))

    # Step 3: generate the train/validation split. Randomly choose 1/10th of
    # the data for validation.
    num_validation = num_duplicate_pairs // 10
    num_train = num_duplicate_pairs - num_validation
    print("validate with:", num_validation, "train with:", num_train)

    # Append a label column: 1 for duplicate pairs, 0 for non-duplicates.
    list_duplicate_pairs = np.array(list(duplicate_pairs))
    list_duplicate_pairs = np.concatenate(
        (list_duplicate_pairs, np.ones(shape=(num_duplicate_pairs, 1))), axis=1)
    list_non_duplicate_pairs = np.array(list(non_duplicate_pairs))
    list_non_duplicate_pairs = np.concatenate(
        (list_non_duplicate_pairs, np.zeros(shape=(num_duplicate_pairs, 1))),
        axis=1)

    # Choose the validation indices (for dup and non-dup separately). The
    # boolean masks filter out the indices already used for validation, so the
    # remaining rows form the training set.
    dup_val_indices = np.random.choice(num_duplicate_pairs, num_validation,
                                       replace=False)
    mask_dup = np.ones(num_duplicate_pairs, dtype=bool)
    mask_dup[dup_val_indices] = False
    nondup_val_indices = np.random.choice(num_duplicate_pairs, num_validation,
                                          replace=False)
    mask_nondup = np.ones(num_duplicate_pairs, dtype=bool)
    mask_nondup[nondup_val_indices] = False

    val_pairs = np.concatenate((list_duplicate_pairs[dup_val_indices],
                                list_non_duplicate_pairs[nondup_val_indices]))
    train_pairs = np.concatenate((list_duplicate_pairs[mask_dup],
                                  list_non_duplicate_pairs[mask_nondup]))
    print("validation set size:", val_pairs.shape,
          "train set size:", train_pairs.shape)

    # Step 4: build the ID sets and labelled pair lists for each split.
    unique_train_ID_set = set()
    unique_valid_ID_set = set()
    train_dup = []
    valid_dup = []
    for entry in val_pairs:
        (post1, post2, dup) = entry
        dup = int(float(dup))
        unique_valid_ID_set.add(post1)
        unique_valid_ID_set.add(post2)
        valid_dup.append([post1, post2, dup])
    for entry in train_pairs:
        (post1, post2, dup) = entry
        dup = int(float(dup))
        unique_train_ID_set.add(post1)
        unique_train_ID_set.add(post2)
        train_dup.append([post1, post2, dup])
    print("almost done with speedy data generation, compiling post bodies...")
    unique_train_ID_list = list(unique_train_ID_set)
    unique_valid_ID_list = list(unique_valid_ID_set)
    # NOTE: the variable names below don't fully reflect what is going on, so
    # beware...
    train_post_body_list = [
        o.perform_cleaning(o.get_postbody(ID), remove_stopwords=True)
        for ID in unique_train_ID_list
    ]
    valid_post_body_list = [
        o.perform_cleaning(o.get_postbody(ID), remove_stopwords=True)
        for ID in unique_valid_ID_list
    ]
    train_post_title_list = [
        o.perform_cleaning(o.get_posttitle(ID), remove_stopwords=True)
        for ID in unique_train_ID_list
    ]
    valid_post_title_list = [
        o.perform_cleaning(o.get_posttitle(ID), remove_stopwords=True)
        for ID in unique_valid_ID_list
    ]
    train_post_both_list = [
        o.perform_cleaning(o.get_post_title_and_body(ID), remove_stopwords=True)
        for ID in unique_train_ID_list
    ]
    valid_post_both_list = [
        o.perform_cleaning(o.get_post_title_and_body(ID), remove_stopwords=True)
        for ID in unique_valid_ID_list
    ]
    # Author reputation (0 for posts without a known user) and post scores.
    train_reputation = {
        ID: (o.get_user_reputation(o.get_postuserid(ID))
             if o.get_postuserid(ID) != False else 0)
        for ID in unique_train_ID_list
    }
    valid_reputation = {
        ID: (o.get_user_reputation(o.get_postuserid(ID))
             if o.get_postuserid(ID) != False else 0)
        for ID in unique_valid_ID_list
    }
    train_scores = {ID: o.get_postscore(ID) for ID in unique_train_ID_list}
    valid_scores = {ID: o.get_postscore(ID) for ID in unique_valid_ID_list}

    train_collection = DatasetContainer(unique_train_ID_list, train_dup,
                                        train_post_body_list,
                                        train_post_title_list,
                                        train_post_both_list, None, None, None,
                                        train_reputation, train_scores)
    valid_collection = DatasetContainer(unique_valid_ID_list, valid_dup,
                                        valid_post_body_list,
                                        valid_post_title_list,
                                        valid_post_both_list, None, None, None,
                                        valid_reputation, valid_scores)
    return (train_collection, valid_collection)
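def _demo_construct(subforum="webmasters"):
    # Hedged usage sketch (not part of the original pipeline): builds the
    # train/validation collections and the small test collection for one
    # subforum. Assumes ../data/cqadupstack/<subforum>.zip and the
    # corresponding <subforum>_testpairs_small.txt file exist.
    train_collection, valid_collection = construct_train_valid(subforum)
    test_collection = extract_test_data(subforum, testflag="small")
    return train_collection, valid_collection, test_collection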
def load_corpus():
    # CORPUS_PATH and CATEGORY are assumed to be module-level constants defined
    # elsewhere: the data directory and the subforum name, respectively.
    o = qcqa.load_subforum(os.path.join(CORPUS_PATH, CATEGORY + '.zip'))
    testset, develset, indexset = o.split_for_retrieval()
    return o, indexset, develset, testset
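# Example call (assuming CORPUS_PATH and CATEGORY are set):
#
#     o, indexset, develset, testset = load_corpus()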
if __name__ == "__main__":
    # Minimal end-to-end example: load a single subforum and split it for
    # classification (query_cqadupstack is already imported above as qcqa).
    xx = qcqa.load_subforum("stats.zip")
    xx.split_for_classification()