def main():
    # TODO: need to agree on a place to put data.
    #subforum = "webmasters"
    # list_of_subforums and qcqa (query_cqadupstack) are assumed to be defined
    # elsewhere in this module.
    for ele in list_of_subforums:
        data_directory = "../data/cqadupstack/" + ele + ".zip"
        o = qcqa.load_subforum(data_directory)

        # Do train/test split
        # Writes output files, takes a minute...
        # TODO: can we make this output the files outside of the git repo?
        o.split_for_classification()
Example No. 2
import os
import pickle
import sys

import query_cqadupstack as qcqa


def extract_dataset(name, path_to_zip, output_path):
    print("Loading the %s subforum" % name)
    src = qcqa.load_subforum("%s/%s.zip" % (path_to_zip, name))
    current_path = os.getcwd()
    if not output_path.startswith("/"):
        output_path = "%s/%s" % (current_path, output_path)
    # Remove trailing "/"
    output_path = output_path.rstrip("/")
    # Create the output directory if it does not already exist.
    if not os.path.isdir(output_path):
        try:
            os.mkdir(output_path)
        except OSError:
            print("output path %s could not be created" % output_path)
            sys.exit(1)

    full_path = "%s/%s" % (output_path, name)
    if not os.path.isdir(full_path):
        try:
            os.mkdir(full_path)
        except OSError:
            print("output path %s could not be created" % full_path)
            sys.exit(1)
    # pickle needs a binary file handle, not text mode.
    with open("%s/%s_src.pkl" % (full_path, name), 'wb') as output:
        pickle.dump(src, output)

    try:
        os.chdir(full_path)
        print("Splitting %s for classification into output path %s" %
              (name, full_path))
        src.split_for_classification()
    except OSError:
        print("Could not access output path")
        sys.exit(1)
    os.chdir(current_path)
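
# A hypothetical driver for extract_dataset(); the subforum names and paths
# below are illustrative assumptions, not part of the original example.
if __name__ == "__main__":
    for forum in ["android", "gis", "webmasters"]:
        extract_dataset(forum, "../data/cqadupstack", "output")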
Example No. 3
def get_duplicate_pairs(subforum):
    data_directory = "../data/cqadupstack/" + subforum + ".zip"
    o = qcqa.load_subforum(data_directory)

    print("collecting duplicate pairs...")

    # Step 1: get all unique pairs
    duplicate_pairs = set()
    posts_with_duplicates = o.get_posts_with_duplicates()
    for post in posts_with_duplicates:
        duplicates = o.get_duplicates(post)
        for dup_candidate in duplicates:
            min_post = unicode(str(min(int(post), int(dup_candidate))),
                               "utf-8")
            max_post = unicode(str(max(int(post), int(dup_candidate))),
                               "utf-8")
            duplicate_pairs.add((min_post, max_post))

    num_duplicate_pairs = len(duplicate_pairs)
    print("Total number of unique duplicate pairs is: %d" % num_duplicate_pairs)

    return duplicate_pairs, num_duplicate_pairs
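
# Example call of the helper above; "webmasters" is one of the CQADupStack
# subforums and is used here only for illustration.
pairs, n_pairs = get_duplicate_pairs("webmasters")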
Example No. 4
def extract_test_data(subforum, testflag="small"):
    """
    Converts test data from a particular subforum into a collections container.

    """

    print("Starting extraction of test data...")

    # Load the subforum data.
    data_directory = "../data/cqadupstack/" + subforum + ".zip"
    o = qcqa.load_subforum(data_directory)

    unique_test_ID_set = set()

    test_dup = []
    with open(subforum + "_testpairs_" + testflag + ".txt") as f:
        for line in f:
            # split() with no argument also strips the trailing newline from
            # the duplicate label.
            data = line.split()
            post1 = unicode(data[0], "utf-8")
            post2 = unicode(data[1], "utf-8")
            dup = unicode(data[2], "utf-8")
            unique_test_ID_set.add(post1)
            unique_test_ID_set.add(post2)
            test_dup += [[post1, post2, dup]]

    unique_test_ID_list = list(unique_test_ID_set)

    # Build cleaned text lists (body, title, title + body) for the test posts
    test_post_body_list = [
        o.perform_cleaning(o.get_postbody(ID), remove_stopwords=True)
        for ID in unique_test_ID_list
    ]
    test_post_title_list = [
        o.perform_cleaning(o.get_posttitle(ID), remove_stopwords=True)
        for ID in unique_test_ID_list
    ]
    test_post_both_list = [
        o.perform_cleaning(o.get_post_title_and_body(ID),
                           remove_stopwords=True) for ID in unique_test_ID_list
    ]
    test_reputation = {
        ID: (o.get_user_reputation(o.get_postuserid(ID))
             if o.get_postuserid(ID) != False else 0)
        for ID in unique_test_ID_list
    }
    test_scores = {ID: o.get_postscore(ID) for ID in unique_test_ID_list}

    test_collection = DatasetContainer(unique_test_ID_list, test_dup,
                                       test_post_body_list,
                                       test_post_title_list,
                                       test_post_both_list, None, None, None,
                                       test_reputation, test_scores)

    # fill in tf matrices
    test_tf_body, feature_names = vectorize(test_collection.bodies)
    test_tf_title, feature_names = vectorize(test_collection.titles)
    test_tf_both, feature_names = vectorize(test_collection.combined)

    test_collection.tf_body = test_tf_body
    test_collection.tf_title = test_tf_title
    test_collection.tf_combined = test_tf_both

    return test_collection
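
# The vectorize() helper and DatasetContainer class used above are defined
# elsewhere in the project. A minimal sketch of a vectorize() compatible with
# the calls above, assuming a scikit-learn term-frequency vectorizer (the real
# implementation may differ):
from sklearn.feature_extraction.text import CountVectorizer

def vectorize(documents):
    vectorizer = CountVectorizer()
    tf_matrix = vectorizer.fit_transform(documents)  # sparse term-frequency matrix
    # get_feature_names() on older scikit-learn / Python 2 installations
    feature_names = vectorizer.get_feature_names_out()
    return tf_matrix, feature_names
# Note that each call above builds its own term-frequency matrix; the returned
# feature_names is overwritten each time and not used further.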
Example No. 5
def construct_train_valid(subforum):
    """Construct the train and validation sets.

    Keyword arguments:
    subforum -- the subforum to build the datasets from; if None, all
                subforums should be used (not yet implemented, see TODO below)
    """
    # TODO: collect all subforums if subforum is None.
    data_directory = "../data/cqadupstack/" + subforum + ".zip"
    o = qcqa.load_subforum(data_directory)

    print("starting speedy data generation...")

    # Step 1: get all unique pairs
    duplicate_pairs, num_duplicate_pairs = get_duplicate_pairs(subforum)

    # Step 2: randomly sample pairs of posts that are not duplicates. Idea is to
    # have roughly a 1:1 ratio of dup to non-dup posts
    non_duplicate_pairs = set()
    while len(non_duplicate_pairs) < num_duplicate_pairs:
        (post1, post2, tag) = o.get_random_pair_of_posts()
        if tag == 'nondup':
            min_post = unicode(str(min(int(post1), int(post2))), "utf-8")
            max_post = unicode(str(max(int(post1), int(post2))), "utf-8")
            non_duplicate_pairs.add((min_post, max_post))

    print("Total number of non-duplicate pairs is:", len(non_duplicate_pairs))

    # Step 3: generate train/validation splits

    # randomly choose 1/10th of the pairs for validation (integer division so
    # the result can be used as a count and an index)
    num_validation = num_duplicate_pairs // 10
    num_train = num_duplicate_pairs - num_validation

    print("validate with: ", num_validation, "train with: ", num_train)

    list_duplicate_pairs = np.array(list(duplicate_pairs))
    list_duplicate_pairs = np.concatenate(
        (list_duplicate_pairs, np.ones(shape=(num_duplicate_pairs, 1))),
        axis=1)
    list_non_duplicate_pairs = np.array(list(non_duplicate_pairs))
    list_non_duplicate_pairs = np.concatenate(
        (list_non_duplicate_pairs, np.zeros(shape=(num_duplicate_pairs, 1))),
        axis=1)

    # Choose the validation indices (for dup and non-dup); the boolean masks
    # select the remaining indices for the training data.
    dup_val_indices = np.random.choice(num_duplicate_pairs,
                                       num_validation,
                                       replace=False)
    mask_dup = np.ones(num_duplicate_pairs, dtype=bool)
    mask_dup[dup_val_indices] = 0

    nondup_val_indices = np.random.choice(num_duplicate_pairs,
                                          num_validation,
                                          replace=False)
    mask_nondup = np.ones(num_duplicate_pairs, dtype=bool)
    mask_nondup[nondup_val_indices] = 0

    val_pairs = np.concatenate((list_duplicate_pairs[dup_val_indices],
                                list_non_duplicate_pairs[nondup_val_indices]))
    train_pairs = np.concatenate((list_duplicate_pairs[mask_dup],
                                  list_non_duplicate_pairs[mask_nondup]))

    print("validation set size: ", val_pairs.shape, "train set size: ",
          train_pairs.shape)

    # Step 4: create the appropriate outputs for the function
    unique_train_ID_set = set()
    unique_valid_ID_set = set()
    train_dup = []
    valid_dup = []

    for entry in val_pairs:
        (post1, post2, dup) = entry
        dup = int(float(dup))
        unique_valid_ID_set.add(post1)
        unique_valid_ID_set.add(post2)
        valid_dup += [[post1, post2, dup]]
    for entry in train_pairs:
        (post1, post2, dup) = entry
        dup = int(float(dup))
        unique_train_ID_set.add(post1)
        unique_train_ID_set.add(post2)
        train_dup += [[post1, post2, dup]]

    print("almost done with speedy data generation, compiling post bodies...")

    unique_train_ID_list = list(unique_train_ID_set)
    unique_valid_ID_list = list(unique_valid_ID_set)

    # NOTE: these variable names don't reflect what is actually going on, so beware...
    train_post_body_list = [
        o.perform_cleaning(o.get_postbody(ID), remove_stopwords=True)
        for ID in unique_train_ID_list
    ]
    valid_post_body_list = [
        o.perform_cleaning(o.get_postbody(ID), remove_stopwords=True)
        for ID in unique_valid_ID_list
    ]

    train_post_title_list = [
        o.perform_cleaning(o.get_posttitle(ID), remove_stopwords=True)
        for ID in unique_train_ID_list
    ]
    valid_post_title_list = [
        o.perform_cleaning(o.get_posttitle(ID), remove_stopwords=True)
        for ID in unique_valid_ID_list
    ]

    train_post_both_list = [
        o.perform_cleaning(o.get_post_title_and_body(ID),
                           remove_stopwords=True)
        for ID in unique_train_ID_list
    ]
    valid_post_both_list = [
        o.perform_cleaning(o.get_post_title_and_body(ID),
                           remove_stopwords=True)
        for ID in unique_valid_ID_list
    ]

    train_reputation = {
        ID: (o.get_user_reputation(o.get_postuserid(ID))
             if o.get_postuserid(ID) != False else 0)
        for ID in unique_train_ID_list
    }
    valid_reputation = {
        ID: (o.get_user_reputation(o.get_postuserid(ID))
             if o.get_postuserid(ID) != False else 0)
        for ID in unique_valid_ID_list
    }

    train_scores = {ID: o.get_postscore(ID) for ID in unique_train_ID_list}
    valid_scores = {ID: o.get_postscore(ID) for ID in unique_valid_ID_list}

    train_collection = DatasetContainer(unique_train_ID_list, train_dup,
                                        train_post_body_list,
                                        train_post_title_list,
                                        train_post_both_list, None, None, None,
                                        train_reputation, train_scores)
    valid_collection = DatasetContainer(unique_valid_ID_list, valid_dup,
                                        valid_post_body_list,
                                        valid_post_title_list,
                                        valid_post_both_list, None, None, None,
                                        valid_reputation, valid_scores)

    return (train_collection, valid_collection)
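
# DatasetContainer is not shown in these examples. Judging from the
# constructor calls and the attribute accesses above, it is a plain record;
# a minimal sketch (the attribute names for ids, dup, reputation and scores
# are assumptions):
class DatasetContainer(object):
    def __init__(self, ids, dup, bodies, titles, combined,
                 tf_body, tf_title, tf_combined, reputation, scores):
        self.ids = ids                  # list of post IDs
        self.dup = dup                  # [post1, post2, label] triples
        self.bodies = bodies            # cleaned post bodies
        self.titles = titles            # cleaned post titles
        self.combined = combined        # cleaned title + body text
        self.tf_body = tf_body          # term-frequency matrices (filled in later)
        self.tf_title = tf_title
        self.tf_combined = tf_combined
        self.reputation = reputation    # post ID -> user reputation
        self.scores = scores            # post ID -> post score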
Example No. 6
def load_corpus():
    o = qcqa.load_subforum(os.path.join(CORPUS_PATH, CATEGORY + '.zip'))
    testset, develset, indexset = o.split_for_retrieval()
    return o, indexset, develset, testset
Example No. 7
import query_cqadupstack as qqp

xx = qqp.load_subforum("stats.zip")

xx.split_for_classification()
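
# Putting the earlier helpers together, a hypothetical end-to-end session
# (assuming construct_train_valid() and extract_test_data() from the examples
# above are importable) might look like:
train_collection, valid_collection = construct_train_valid("webmasters")
test_collection = extract_test_data("webmasters", testflag="small")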