Example #1
import logging
import os
import pickle
from argparse import ArgumentParser
from pathlib import Path

import neuralcoref
import spacy
from logging_setup_dla.logging import set_up_root_logger

# load_cache, get_all_objects and find_corefs are assumed to be project-local helpers.

logger = logging.getLogger(__name__)


def main():
    parser = ArgumentParser()
    parser.add_argument('--htmls_fname', type=str, required=True)
    parser.add_argument('--objects_fname', type=str, required=True)
    parser.add_argument('--htmls_coref_cache', type=str, required=True)
    parser.add_argument('--work_dir',
                        type=str,
                        required=False,
                        default=os.getcwd())
    args = parser.parse_args()
    work_dir = args.work_dir
    set_up_root_logger('COREF', os.path.join(work_dir, 'logs'))
    logger.info("Start coreference parsing")

    html_fname: str = args.htmls_fname
    objects_path = Path(args.objects_fname)
    htmls_coref_cache_fname: str = args.htmls_coref_cache

    with open(html_fname, "rb") as f_html:
        htmls_lookup = pickle.load(f_html)

    htmls_lookup_coref = load_cache(htmls_coref_cache_fname)

    names = get_all_objects(objects_path, work_dir)
    logger.info(f'Number of objects: {len(names)}')

    spacy.require_gpu()
    nlp = spacy.load('en_core_web_sm')
    neuralcoref.add_to_pipe(nlp)
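    # neuralcoref registers a coreference component on the spaCy pipeline;
    # processed Docs then expose clusters via doc._.coref_clusters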

    find_corefs(htmls_coref_cache_fname, htmls_lookup, htmls_lookup_coref,
                names, nlp)

    with open(htmls_coref_cache_fname, 'wb') as f:
        pickle.dump(htmls_lookup_coref, f, pickle.HIGHEST_PROTOCOL)

    logger.info('Finished')


if __name__ == "__main__":
    try:
        main()
    except Exception:
        logger.exception("Unhandled exception")
Example #2
import logging
import os
from datetime import datetime

import yaml
from box import Box
from learning_sizes_evaluation.evaluate import coverage_accuracy_relational, RelationalResult
from logging_setup_dla.logging import set_up_root_logger
from matplotlib import pyplot as plt, colors, cm
from matplotlib.scale import SymmetricalLogTransform
from scipy import stats
from scipy.stats import pearsonr, spearmanr
from sklearn.linear_model import Ridge
from visual_size_comparison.config import VisualConfig
from visual_size_comparison.propagation import build_cooccurrence_graph, Pair, VisualPropagation

from breds.breds_inference import find_similar_words, BackoffSettings, comparison_dev_set
from breds.config import Config, load_word2vec

set_up_root_logger(f'INFERENCE_VISUAL_{datetime.now().strftime("%d%m%Y%H%M%S")}', os.path.join(os.getcwd(), 'logs'))
logger = logging.getLogger(__name__)


def main():
    with open("config.yml", "r") as ymlfile:
        cfg = Box(yaml.safe_load(ymlfile))
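        # Box gives attribute-style access to the parsed YAML (e.g. cfg.path.vg_objects below)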
        # cfg = Box(yaml.safe_load(ymlfile), default_box=True, default_box_attr=None)

    test_pairs, unseen_objects = comparison_dev_set(cfg)
    unseen_objects = [o.replace('_', " ") for o in unseen_objects]

    # TODO check whether the objects aren't in the bootstrapped objects
    visual_config = VisualConfig(cfg.path.vg_objects, cfg.path.vg_objects_anchors)
    config = Config(cfg, visual_config)
Example #3
import logging
import os
from datetime import datetime

from logging_setup_dla.logging import set_up_root_logger
from size_comparisons.inference.baseline_numeric_gaussians import find_confidences_for_pairs_lazy
from size_comparisons.parse_objects import InputsParser
from size_comparisons.scraping.compilation import fill_dataframe

set_up_root_logger(f'SEEDLAZY_{datetime.now().strftime("%d%m%Y%H%M%S")}',
                   os.getcwd())

logger = logging.getLogger(__name__)


def main():
    input_parser = InputsParser()
    labels = input_parser.retrieve_labels()
    names = input_parser.retrieve_names()
    data = fill_dataframe(names, labels)
    test_pairs = input_parser.retrieve_test_pairs()
    test_pairs_tuples = list(
        test_pairs.itertuples(name='TestPair', index=False))
    find_confidences_for_pairs_lazy(data, test_pairs_tuples)


if __name__ == "__main__":
    try:
        main()
    except Exception:
        logger.exception("Unhandled exception")
Example #4
import logging
import os
import random
from datetime import datetime

from box import Box
import numpy as np
from logging_setup_dla.logging import set_up_root_logger
from pandas import DataFrame
from sklearn.ensemble import IsolationForest
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from visual_size_comparison.config import VisualConfig
from visual_size_comparison.propagation import build_cooccurrence_graph, Pair, VisualPropagation

from breds.config import Config
from sklearn.svm import SVC, LinearSVC
from matplotlib import pyplot as plt
from learning_sizes_evaluation.evaluate import precision_recall, range_distance, Result

set_up_root_logger(f'RANGES_{datetime.now().strftime("%d%m%Y%H%M%S")}',
                   os.path.join(os.getcwd(), 'logs'))
logger = logging.getLogger(__name__)


def iteratively_find_size(lower_bounds_sizes, upper_bounds_sizes):
    """Randomly discard conflicting bounds until max(lower) <= min(upper)."""
    lower = lower_bounds_sizes.copy()
    upper = upper_bounds_sizes.copy()
    total_objects = len(upper) + len(lower)
    # Probability of dropping a lower bound, proportional to its share of all bounds.
    size_scale = len(lower) / total_objects
    logger.info(f'Scale: {size_scale}')
    count_l = 0
    count_r = 0
    while len(upper) >= 1 and len(lower) >= 1 and max(lower) > min(upper):
        r = random.random()
        if r < size_scale:
            # Drop the most extreme (largest) conflicting lower bound.
            lower.remove(max(lower))
Example #5
import fileinput
import logging
import os
from collections import namedtuple
from math import ceil, floor

import pandas as pd
from logging_setup_dla.logging import set_up_root_logger
from matplotlib.scale import SymmetricalLogTransform
from matplotlib import pyplot as plt
import numpy as np

set_up_root_logger('ANALYZETEST', os.path.join(os.getcwd(), 'logs'))
logger = logging.getLogger(__name__)


def main():
    anouk = pd.read_csv('data_numeric/VG_YOLO_intersection_test_annotated_anouk.csv').astype({'object': str})
    anouk.set_index(['object'], inplace=True, drop=False)
    bram = pd.read_csv('data_numeric/VG_YOLO_intersection_test_annotated_bram.csv').astype({'object': str})
    bram.set_index(['object'], inplace=True, drop=False)
    assert len(anouk.keys()) == len(bram.keys())

    bram_no_size = [line.strip().lower() for line in fileinput.input('data_numeric/hard_words_bram.txt')]
    anouk_no_size = [line.strip().lower() for line in fileinput.input('data_numeric/hard_words_anouk.txt')]
    remove = ['snow', 'architecture', 'toilet water']

    Result = namedtuple('Result', ['object', 'min', 'max'])
    results = list()
Example #6
import logging
import os
import pickle
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm
from logging_setup_dla.logging import set_up_root_logger
from matplotlib import colors, cm
from scipy import stats
from size_comparisons.exploration.explore_infoboxes import Record, search_infoboxes
from size_comparisons.parse_objects import InputsParser
from size_comparisons.scraping.analyze import retrieve_synset

SAMPLE = False

set_up_root_logger('INFOBOXES', os.getcwd())
logger = logging.getLogger(__name__)


def main():
    # data = pd.read_csv('D:\GitHubD\size-comparisons\data\manually_selected.csv')
    # objects = data['object']
    inputparser = InputsParser()
    names = inputparser.retrieve_names()
    # names = [line.strip() for line in fileinput.input('D:\GitHubD\size-comparisons\examples\exploration\VisualGenome_REFORMAT.txt')]
    labels = inputparser.retrieve_labels()
    fname_records = 'records.pkl'
    if os.path.exists(fname_records):
        with open(fname_records, 'rb') as f:
            records = pickle.load(f)
    else:
        records = [Record(name, labels[i]) for i, name in enumerate(names)]
Example #7
import logging
import os
from datetime import datetime

from logging_setup_dla.logging import set_up_root_logger
from size_comparisons.parse_objects import InputsParser
from size_comparisons.scraping.analyze import retrieve_synset

set_up_root_logger(f'ERRORANALYSIS_{datetime.now().strftime("%d%m%Y%H%M%S")}',
                   os.getcwd())

logger = logging.getLogger(__name__)

PRINT_HTML = False

analyzed_terms = ['n01581984']


def main():
    input_parser = InputsParser()
    if PRINT_HTML:
        htmls = input_parser.retrieve_google_results_html()
    # assuming retrieve_synset returns an NLTK synset, use the public .name() accessor
    synset_names = [retrieve_synset(label).name() for label in analyzed_terms]
    regex_sizes = input_parser.retrieve_regex_scraper_sizes()
    regex_contexts = input_parser.retrieve_regex_scraper_contexts()
    for i, term in enumerate(analyzed_terms):
        logger.info(term)
        logger.info(synset_names[i])
        if PRINT_HTML:
            with open('htmls.txt', 'w') as f:
Example #8
import logging
import os
import pickle
from datetime import datetime

from logging_setup_dla.logging import set_up_root_logger
from size_comparisons.inference.baseline_numeric_gaussians import BaselineNumericGaussians
from size_comparisons.parse_objects import InputsParser
from size_comparisons.scraping.compilation import fill_dataframe

set_up_root_logger(f'NUMERIC_SEED_{datetime.now().strftime("%d%m%Y%H%M%S")}',
                   os.getcwd())

logger = logging.getLogger(__name__)

selected = [
    'tiger', 'insect', 'ocean', 'cat', 'dog', 'crown', 'neuropteron',
    'diving suit', 'light-emitting diode', 'stone'
]


def main():
    input_parser = InputsParser()
    labels = input_parser.retrieve_labels()
    names = input_parser.retrieve_names()
    data = fill_dataframe(names, labels)
    # mask = data['name'].isin(selected)
    # data = data[mask]
    baseline = BaselineNumericGaussians(data)
    baseline.fill_adjacency_matrix()
    with open(input_parser.data_dir / 'baseline.p', 'wb') as f:
        pickle.dump(baseline, f)
Example #9
import logging
import os

import pandas as pd
import tqdm
from logging_setup_dla.logging import set_up_root_logger
from sklearn.metrics import precision_score, recall_score

from reject_abstract_objects.reject import check_abstract

set_up_root_logger('reject', os.path.join(os.getcwd(), 'logs'))

logger = logging.getLogger(__name__)

THE = True


def main():
    """Run an example of the abstract word rejection system."""
    df = pd.read_csv('test_annotated.csv')
    df = df[df['abstract'] != -1]

    logger.info(f"Percentage abstract: {df['abstract'].mean()}")

    y_true = list(df['abstract'])
    y_pred = list()
    no_results = set()
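    # entities for which check_abstract returned no result (assumption: added to this set below)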
    for entity in tqdm.tqdm(list(df['object'])):
        res, no_result = check_abstract(entity, the=THE)
        y_pred.append(res)
        if no_result:
Example #10
import json
import logging
import os
from argparse import ArgumentParser
from datetime import datetime

from logging_setup_dla.logging import set_up_root_logger
from size_comparisons.parse_objects import InputsParser
from size_comparisons.scraping.frequencies_wikipedia import find_frequencies_wikipedia

set_up_root_logger(f'FREQSWIKI_{datetime.now().strftime("%d%m%Y%H%M%S")}',
                   os.getcwd())

logger = logging.getLogger(__name__)


def main():
    """Retrieve frequencies from a wikipedia Lucene index."""
    parser = ArgumentParser()
    parser.add_argument('--index', type=str, required=True)
    args = parser.parse_args()
    index_dir = args.index
    inputparser = InputsParser()
    names = inputparser.retrieve_names()
    fname = inputparser.data_dir / 'frequencies_wikipedia.json'
    freqs = find_frequencies_wikipedia(names, index_dir)

    with open(fname, 'w') as wf:
        json.dump(freqs, wf)

Example #11
import logging
import os
import pickle

from logging_setup_dla.logging import set_up_root_logger
from nltk import tokenize
import numpy as np

set_up_root_logger('HTMLSSTATS', os.path.join(os.getcwd(), 'logs'))
logger = logging.getLogger(__name__)


def main():
    with open('htmls.pkl', "rb") as f_html:
        results: dict = pickle.load(f_html)

    sizes = []
    for htmls in results.values():
        for html in htmls:
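            # word_tokenize requires NLTK's 'punkt' tokenizer models to be downloaded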
            words = tokenize.word_tokenize(html)
            sizes.append(len(words))

    logger.info(f'Mean doc size: {np.mean(sizes)}')
    logger.info(f'Median doc size: {np.median(sizes)}')


if __name__ == "__main__":
    try:
        main()
    except Exception:
        logger.exception("Unhandled exception")
Example #12
import logging
import os
import pickle
import random

import numpy as np
import pandas as pd

import yaml
from box import Box
from learning_sizes_evaluation.evaluate import coverage_accuracy_relational, RelationalResult
from learning_sizes_evaluation.monte_carlo_permutation_test import permutation_test
from logging_setup_dla.logging import set_up_root_logger

from breds.breds_inference import comparison_dev_set

set_up_root_logger('COMBINE', os.path.join(os.getcwd(), 'logs'))
logger = logging.getLogger(__name__)


def get_result(golds, preds, tag, notes):
    coverage, selectivity = coverage_accuracy_relational(golds,
                                                         preds,
                                                         notes=notes)
    logger.info(f'Coverage: {coverage}')
    logger.info(f'Selectivity: {selectivity}')

    return RelationalResult(tag, selectivity, coverage)


def random_combination(visual, linguistic):
    """Use a random combination of the linguistic and visual system."""