def transcribe_labels(results_fname):
    """
    Read the results file named by results_fname, stored under the crowdflower
    subdir of the data dir, and interpret the annotations into labels.  Then
    write those labels out into the relational-nouns subdir of the data dir,
    within a subsubdir that has the same name as results_fname.  Record the
    results in three separate files -- one for each source the words were
    drawn from.
    """

    # Work out paths
    results_path = os.path.join(DATA_DIR, 'crowdflower', results_fname)
    result_fname_no_ext = results_fname.rsplit('.', 1)[0]
    labels_dir = os.path.join(DATA_DIR, 'relational-nouns', result_fname_no_ext)
    t4k.ensure_exists(labels_dir)

    # Read in the results, and interpret labels
    crowdflower_results = t4k.CrowdflowerResults(
        results_path, lambda x:x['data']['token'])
    word_labels = interpret_annotations_by_source(crowdflower_results)

    # Write labels to disk, with words coming from different sources put into
    # different files.
    for source in word_labels:
        label_path = os.path.join(labels_dir, source + '.tsv')
        with open(label_path, 'w') as source_label_file:
            for word, label in word_labels[source].items():
                source_label_file.write(word + '\t' + label + '\n')
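
# Example usage (the file name here is hypothetical; it simply follows the
# 'results%d.json' convention used by do_generate_candidates_iteration below):
#
#     transcribe_labels('results1.json')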
def extract_features_from_parc(num_processes=NUM_PROCESSES,
                               limit=pd.MAX_ARTICLE_NUM,
                               out_path=None):

    # Resolve the path to the results file and open it for writing.
    if out_path is None:
        out_dir = os.path.join(DATA_DIR, 'parc-verifiability')
        t4k.ensure_exists(out_dir)
        out_path = os.path.join(out_dir, 'features.np')
    out_file = open(out_path, 'wb')

    # Make a queue so that workers can send results back
    results_queue = iq.IterableQueue()

    # Start a bunch of workers
    for proc_num in range(num_processes):
        p = multiprocessing.Process(target=extract_features_from_parc_worker,
                                    args=(results_queue.get_producer(),
                                          proc_num, num_processes, limit))
        p.start()

    # Get an endpoint to collect the work, then close the queue since we won't
    # make any more endpoints
    results_consumer = results_queue.get_consumer()
    results_queue.close()

    # Collect all the incoming work from the workers
    all_result_vectors = []
    for result_vectors in results_consumer:
        all_result_vectors.extend(result_vectors)

    # Turn all the result vectors into a single pandas dataframe, and save it
    use_headers = headers[:1] + headers[2:]
    data_frame = pandas.DataFrame(all_result_vectors, columns=use_headers)
    pickle.dump(data_frame, out_file)
    out_file.close()
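

# Illustrative sketch only -- the project's actual extract_features_from_parc_worker
# is not part of this excerpt.  It assumes the iterable_queue contract used above:
# each worker sends its results through its producer endpoint and then closes that
# endpoint, which is what allows the consumer loop to terminate.  The round-robin
# split of article numbers by proc_num is likewise an assumption.
def extract_features_from_parc_worker_sketch(
        results_producer, proc_num, num_processes, limit):
    for article_num in range(proc_num, limit, num_processes):
        # One list of feature vectors per article.
        results_producer.put(extract_features_from_parc_file(article_num))
    # Closing the producer signals that this worker has finished.
    results_producer.close()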
def extract_features_from_parc_file(article_num):
    parc_features_dir = os.path.join(DATA_DIR, 'parc-verifiability',
                                     'features')
    t4k.ensure_exists(parc_features_dir)
    corenlp_path, parc_path, raw_path = pd.get_article_paths(article_num)
    out_vector_path = os.path.join(parc_features_dir,
                                   pd.get_article_features_path(article_num))
    return extract_features(corenlp_path, parc_path, raw_path, out_vector_path)
def do_generate_candidates1():

    # Decide the output path and the number of positive candidates to find
    t4k.ensure_exists(CANDIDATES_DIR)
    out_path = os.path.join(CANDIDATES_DIR, 'candidates1.txt')
    num_to_generate = 1000

    # Read in the seed set, which is the basis for the model that selects new 
    # candidates
    pos, neg, neut = utils.get_full_seed_set()

    # Don't keep any candidates that were already in the seed set
    exclude = pos | neg | neut

    generate_candidates.generate_candidates(
        num_to_generate, out_path, pos, neg, exclude)
def do_generate_candidates_iteration(iteration=2, kernel=None, features=None):

    # Work out the file names
    candidates_fname = 'candidates%d.txt' % iteration
    random_candidates_fname = 'random_candidates%d.txt' % iteration
    previous_results_fname = 'results%d.json' % (iteration-1)
    previous_labels_dirname = 'results%d' % (iteration-1)
    previous_task_fnames = ['task%d.csv' % j for j in range(1,iteration)]

    # Decide the output path and the number of positive candidates to find
    t4k.ensure_exists(CANDIDATES_DIR)
    out_path = os.path.join(CANDIDATES_DIR, candidates_fname)
    random_out_path = os.path.join(CANDIDATES_DIR, random_candidates_fname)
    num_to_generate = 1000

    # Read in the seed set, which is the basis for the model that selects new 
    # candidates
    pos, neg, neut = utils.get_full_seed_set()
    exclude = pos | neg | neut

    # Read in the labelled data inside the first set of results
    transcribe_labels(previous_results_fname)
    add_pos, add_neg, add_neut = utils.read_all_labels(os.path.join(
        DATA_DIR, 'relational-nouns', previous_labels_dirname))

    # Add in these nouns to the seeds
    pos.update(add_pos)
    neg.update(add_neg)
    neut.update(add_neut)

    # Don't keep any candidates that were already in the seed set or previously
    # loaded questions
    for task_fname in previous_task_fnames:
        task_path = os.path.join(CROWDFLOWER_DIR, task_fname)
        reader = csv.DictReader(open(task_path))
        exclude.update([row['token'] for row in reader])

    ## Generate the non-random candidates, enabling enrichment of positives
    #generate_candidates.generate_candidates_ordinal(
    #    num_to_generate, out_path, pos, neg, neut, exclude, kernel, features)

    # Generate random candidates, enabling exploration and model testing.
    generate_candidates.generate_random_candidates(2000, random_out_path)


def make_crowdflower_csv(iteration=2):

    # Seed randomness for reproducibility
    random.seed(0)

    # Open a file at which to write the csv file
    t4k.ensure_exists(CROWDFLOWER_DIR)
    task_fname = 'task%d.csv' % iteration
    csv_path = os.path.join(CROWDFLOWER_DIR, task_fname)
    csv_f = open(csv_path, 'w')

    # First read the scored candidates
    pos_common_candidates = []
    neg_common_candidates = []
    neut_common_candidates = []
    candidates_fname = 'candidates%d.txt' % iteration
    for line in open(os.path.join(CANDIDATES_DIR, candidates_fname)):
        token, class_ = line.rstrip('\n').split('\t')[:2]
        if class_ == '+':
            pos_common_candidates.append(token)
        elif class_ == '-':
            neg_common_candidates.append(token)
        elif class_ == '0':
            neut_common_candidates.append(token)
        else:
            raise ValueError('Unexpected classification character: %s' % class_)

    # Keep at most the first 1000 candidates from each class.
    positives = pos_common_candidates[:1000]
    neutrals = neut_common_candidates[:1000]
    negatives = neg_common_candidates[:1000]

    #num_neut = min(250, len(neut_common_candidates))
    #neg_common_candidates = neg_common_candidates[:500-num_neut]
    #neut_common_candidates = neut_common_candidates[:num_neut]

    # Next read the random candidates
    random_candidates_fname = 'random_candidates%d.txt' % iteration
    random_candidates_path = os.path.join(
        CANDIDATES_DIR, random_candidates_fname)
    random_candidates = open(random_candidates_path).read().strip().split('\n')
    random_candidates = random_candidates[:2000]

    # Collect all the candidate words together and eliminate dupes
    all_candidates = set(positives + negatives + neutrals + random_candidates)

    # Now keep track of why each word was included (i.e. was it labelled by the
    # classifier-to-date as positive, negative, or neutral, or was it randomly
    # sampled?).  Note that a word could be both randomly drawn and labelled.
    positives = set(positives)
    negatives = set(negatives)
    neutrals = set(neutrals)
    random_candidates = set(random_candidates)
    sourced_candidates = []
    for candidate in all_candidates:
        sources = []
        if candidate in positives:
            sources.append('pos2')
        if candidate in negatives:
            sources.append('neg2')
        if candidate in neutrals:
            sources.append('neut2')
        if candidate in random_candidates:
            sources.append('rand2')
        sourced_candidates.append((candidate, ':'.join(sources)))
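    # Each entry of sourced_candidates is (token, source), where source joins
    # with ':' every reason the token was included -- e.g. a token that was
    # both classified as positive and randomly sampled gets 'pos2:rand2'.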

    # randomize the ordering
    random.shuffle(sourced_candidates)

    # Write a csv file with the candidate words in it
    writer = csv.writer(csv_f)
    writer.writerow(['token', 'source'])
    writer.writerows(sourced_candidates)
    csv_f.close()