Example #1
def main():
    """Parse the sizes using regex from google search results."""
    with open("config_scraper.yml", "r") as ymlfile:
        cfg = Box(yaml.safe_load(ymlfile))
    inputparser = InputsParser()
    input: DataFrame = pd.read_csv(cfg.path.objects)
    input = input.astype({'object': str})
    objects = list(input['object'])
    logger.info(f'objects: {objects}')

    lookups_wrapper = None
    if USE_WIKI:
        lookups_wrapper = inputparser.retrieve_wikipedia_lookups()
    with open(cfg.path.htmls_cache, 'rb') as f:
        htmls_lookup: dict = pickle.load(f)

    fname = inputparser.data_dir / 'regex_sizes.p'
    fname_contexts = inputparser.data_dir / 'regex_contexts.p'
    sizes_lookup, results_contexts = parse_documents_for_lengths(
        objects, htmls_lookup, lookups_wrapper=lookups_wrapper)
    with open(fname, 'wb') as f:
        pickle.dump(sizes_lookup, f)
    with open(fname_contexts, 'wb') as f:
        pickle.dump(results_contexts, f)
    logger.info(sizes_lookup)

    point_predictions = dict()
    for o in objects:
        mean = predict_size_regex(o, sizes_lookup)
        point_predictions[o] = mean

    logger.info(point_predictions)
    with open('regex_predictions.pkl', 'wb') as f:
        pickle.dump(point_predictions, f)

    precision_recall(input, point_predictions)
    range_distance(input, point_predictions)
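Note: predict_size_regex is not shown in this example; below is a minimal sketch, assuming it simply aggregates the scraped size candidates for one object (the repository's implementation may filter or weight them differently):

import statistics

def predict_size_regex(obj: str, sizes_lookup: dict) -> float:
    # Illustrative stand-in, not the repository's implementation: collapse the
    # scraped size candidates for one object into a single point estimate.
    sizes = sizes_lookup.get(obj, [])
    if not sizes:
        return float('nan')  # no regex hits for this object
    return statistics.median(sizes)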
Example #2
def main():
    """"""
    inputparser = InputsParser()
    names = inputparser.retrieve_names()
    labels = inputparser.retrieve_labels()
    wiki_lookups = retrieve_wikipedia_pages(names, labels)
    with open(inputparser.data_dir / 'wikipedia_lookups.p', 'wb') as f:
        pickle.dump(wiki_lookups, f)
Example #3
def main():
    """Retrieve the html pages for the urls in the google search results."""
    parser = ArgumentParser()
    parser.add_argument('--datadir', default=None, type=str)
    args = parser.parse_args()
    data_path = None
    if args.datadir is not None:
        data_path = Path(args.datadir)
    inputparser = InputsParser(data_dir=data_path)
    labels = inputparser.retrieve_labels()
    fname = 'google_results_html.p'
    file_path = inputparser.data_dir / fname
    logger.info(f'Will save result at {file_path}')
    urls = inputparser.retrieve_google_urls()
    loop = asyncio.get_event_loop()
    htmls_lookup = html_scraper.create_or_update_urls_html(labels, urls, loop)
    for i in range(2):
        try:
            logger.info("Try saving the results")
            with open(file_path, 'wb') as f:
                pickle.dump(htmls_lookup, f, pickle.HIGHEST_PROTOCOL)
            logger.info("Saved")
        except PermissionError:
            wait = 300.
            logger.info(f"Received permissionerror, wait {wait} seconds before retry")
            time.sleep(wait)
            continue
        break
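Note: the two-attempt loop above guards against the output file being locked by another process; the same pattern restated as a generic, hypothetical helper (save_with_retry is not part of the repository):

import pickle
import time
from pathlib import Path

def save_with_retry(obj, file_path: Path, attempts: int = 2, wait: float = 300.0) -> bool:
    # Try to pickle `obj` to `file_path`, sleeping and retrying on PermissionError.
    for attempt in range(attempts):
        try:
            with open(file_path, 'wb') as f:
                pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
            return True
        except PermissionError:
            if attempt < attempts - 1:
                time.sleep(wait)
    return False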
Example #4
def main():
    input_parser = InputsParser()
    labels = input_parser.retrieve_labels()
    names = input_parser.retrieve_names()
    data = fill_dataframe(names, labels)
    test_pairs = input_parser.retrieve_test_pairs()
    test_pairs_tuples = list(
        test_pairs.itertuples(name='TestPair', index=False))
    find_confidences_for_pairs_lazy(data, test_pairs_tuples)
Example #5
def load_and_update_baseline(data_dir=None) -> BaselineNumericGaussians:
    """Load numeric graph and update distance matrix."""
    input_parser = InputsParser(data_dir=data_dir)
    labels = input_parser.retrieve_labels()
    names = input_parser.retrieve_names()
    data = fill_dataframe(names, labels, datadir=data_dir)
    matrix = input_parser.load_adjacency_matrix()
    baseline = BaselineNumericGaussians(data, matrix=matrix)
    baseline.update_distance_matrix()
    return baseline
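Note: a hypothetical call site for load_and_update_baseline, assuming the cached inputs (names, labels, adjacency matrix) already live in a local data/ directory:

baseline = load_and_update_baseline(data_dir='data')
# The returned BaselineNumericGaussians carries the filled dataframe and a
# refreshed distance matrix, ready for downstream queries.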
Example #6
def main():
    """Retrieve Google search results for all object lengths."""
    inputparser = InputsParser()
    names = inputparser.retrieve_names()
    queries = [[f'{name} length'] for name in names]

    labels = inputparser.retrieve_labels()
    fname = 'google_urls.p'
    file_path = inputparser.data_dir / fname
    create_or_update_results(file_path, queries, labels)
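Note: the queries built above are one single-element list per name; with hypothetical names:

names = ['blue whale', 'ant']  # hypothetical example values
queries = [[f'{name} length'] for name in names]
assert queries == [['blue whale length'], ['ant length']]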
Example #7
def main():
    """Build the numeric baseline graph and cache it to disk."""
    input_parser = InputsParser()
    labels = input_parser.retrieve_labels()
    names = input_parser.retrieve_names()
    data = fill_dataframe(names, labels)
    # mask = data['name'].isin(selected)
    # data = data[mask]
    baseline = BaselineNumericGaussians(data)
    baseline.fill_adjacency_matrix()
    with open(input_parser.data_dir / 'baseline.p', 'wb') as f:
        pickle.dump(baseline, f)
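Note: to reuse the cached baseline in a later session it can be unpickled again; a minimal sketch (reading with 'rb', matching the 'wb' used when writing above):

import pickle

input_parser = InputsParser()
with open(input_parser.data_dir / 'baseline.p', 'rb') as f:
    baseline = pickle.load(f)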
Example #8
def main():
    # data = pd.read_csv('D:\GitHubD\size-comparisons\data\manually_selected.csv')
    # objects = data['object']
    inputparser = InputsParser()
    names = inputparser.retrieve_names()
    # names = [line.strip() for line in fileinput.input('D:\GitHubD\size-comparisons\examples\exploration\VisualGenome_REFORMAT.txt')]
    labels = inputparser.retrieve_labels()
    fname_records = 'records.pkl'
    if os.path.exists(fname_records):
        with open(fname_records, 'rb') as f:
            records = pickle.load(f)
    else:
        records = [Record(name, labels[i]) for i, name in enumerate(names)]
        random.seed(41)
        if SAMPLE:
            records = random.sample(records, 50)
        del names
        del labels

        for record in records:
            synset = retrieve_synset(record.label)
            record.category = synset.lexname()

        lexnames = [record.category for record in records]
        pd.Series(lexnames).value_counts().plot(kind='bar')
        plt.xticks(rotation=90)
        plt.show()

        for record in tqdm.tqdm(records):
            search_infoboxes(record)

        with open(fname_records, 'wb') as f:
            pickle.dump(records, f, pickle.HIGHEST_PROTOCOL)
    ngram_count_lookup = inputparser.retrieve_frequencies()
    for record in records:
        try:
            count = int(ngram_count_lookup[record.name])
            record.count = count
        except KeyError:
            record.count = None
            continue

    logger.info(f'Number of records: {len(records)}')
    data_dict = dict()
    data_dict['names'] = [record.name for record in records]
    data_dict['height'] = [record.height for record in records]
    data_dict['size'] = [record.size for record in records]
    data_dict['length'] = [record.length for record in records]
    data_dict['category'] = [record.category for record in records]
    data_dict['count'] = [record.count for record in records]
    df = pd.DataFrame(data=data_dict)
    df = df.dropna()
    df['any'] = (df['height']) | (df['size']) | (df['length'])

    plot_results(df, inputparser)
Example #9
def main():
    """Retrieve frequencies from a wikipedia Lucene index."""
    parser = ArgumentParser()
    parser.add_argument('--index', type=str, required=True)
    args = parser.parse_args()
    index_dir = args.index
    inputparser = InputsParser()
    names = inputparser.retrieve_names()
    fname = inputparser.data_dir / 'frequencies_wikipedia.json'
    freqs = find_frequencies_wikipedia(names, index_dir)

    with open(fname, 'w') as wf:
        json.dump(freqs, wf)
Example #10
def main():
    """Inspect scraped sizes and contexts for the analyzed terms."""
    input_parser = InputsParser()
    if PRINT_HTML:
        htmls = input_parser.retrieve_google_results_html()
    synset_names = [retrieve_synset(label)._name for label in analyzed_terms]
    regex_sizes = input_parser.retrieve_regex_scraper_sizes()
    regex_contexts = input_parser.retrieve_regex_scraper_contexts()
    for i, term in enumerate(analyzed_terms):
        logger.info(term)
        logger.info(synset_names[i])
        if PRINT_HTML:
            with open('htmls.txt', 'w') as f:
                f.writelines(htmls[term])
        else:
            logger.info(regex_contexts[term])
        logger.info(regex_sizes[term])
Example #11
def fill_dataframe(names: list,
                   labels: list,
                   remove_outliers=True,
                   remove_zeroes=True,
                   datadir: str = None,
                   use_wikipedia=False):
    """Compile a dataframe of scraped data for further analysis."""
    # IMPORT DATA
    input_parser = InputsParser(data_dir=datadir)
    potential_fname = input_parser.data_dir / "parsed_data.pkl"
    if os.path.exists(potential_fname):
        logger.info('LOADING CACHED DATAFRAME')
        return pd.read_pickle(potential_fname)
    ngram_count_lookup = input_parser.retrieve_frequencies()
    counts_wikipedia = input_parser.retrieve_frequencies(wikipedia=True)
    wiki_lookup_wrapper = input_parser.retrieve_wikipedia_lookups()
    sizes_lookup = input_parser.retrieve_regex_scraper_sizes()
    results = []
    envelope_errors = 0

    for i in tqdm.trange(len(labels)):
        # Get name and label
        name = names[i]
        label = labels[i]

        # Retrieve synset
        # synset = retrieve_synset(label)

        # Wikipedia entry
        lookup = wiki_lookup_wrapper.lookup(label)
        exists = lookup.exists()

        disambiguation = is_disambiguation(lookup)

        sizes = sizes_lookup[label]
        e, sizes = clean_sizes(remove_outliers, remove_zeroes, sizes)
        envelope_errors += e

        n_data_points = len(sizes)
        mean, std = mean_and_std(sizes)
        # plot_sizes_with_gaussian(sizes, name)

        # Add ngram count
        count = None
        if name in ngram_count_lookup.keys():
            count = int(ngram_count_lookup[name])

        count_wiki = None
        if name in counts_wikipedia.keys():
            count_wiki = int(counts_wikipedia[name])

        n = check_n(name)
        entry = Entry(label, name, exists, disambiguation, count, count_wiki,
                      n, sizes, mean, std, n_data_points)
        results.append(entry)

    if envelope_errors > 0:
        logger.info(
            f"WARNING: {envelope_errors} value errors while removing outliers")
    data = pd.DataFrame(results)
    data.to_pickle(potential_fname)
    return data
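Note: a hypothetical usage of fill_dataframe, mirroring the main() examples above; the exact column names of the returned dataframe depend on the fields of Entry:

input_parser = InputsParser()
data = fill_dataframe(input_parser.retrieve_names(),
                      input_parser.retrieve_labels(),
                      remove_outliers=True,
                      remove_zeroes=True)
print(data.head())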