def main():
    """Run coreference resolution over scraped HTML documents and persist the cache."""
    logger.info("Start coreference parsing")

    # Command-line interface.
    arg_parser = ArgumentParser()
    arg_parser.add_argument('--htmls_fname', type=str, required=True)
    arg_parser.add_argument('--objects_fname', type=str, required=True)
    arg_parser.add_argument('--htmls_coref_cache', type=str, required=True)
    arg_parser.add_argument('--work_dir', type=str, required=False, default=os.getcwd())
    cli = arg_parser.parse_args()

    work_dir = cli.work_dir
    set_up_root_logger('COREF', os.path.join(work_dir, 'logs'))

    html_fname: str = cli.htmls_fname
    objects_path = Path(cli.objects_fname)
    htmls_coref_cache_fname: str = cli.htmls_coref_cache

    # Load the raw HTML lookup and any previously cached coreference results.
    with open(html_fname, "rb") as f_html:
        htmls_lookup = pickle.load(f_html)
    htmls_lookup_coref = load_cache(htmls_coref_cache_fname)

    names = get_all_objects(objects_path, work_dir)
    logger.info(f'Number of objects: {len(names)}')

    # Build the spaCy pipeline with neural coreference resolution (GPU required).
    spacy.require_gpu()
    nlp = spacy.load('en_core_web_sm')
    neuralcoref.add_to_pipe(nlp)

    find_corefs(htmls_coref_cache_fname, htmls_lookup, htmls_lookup_coref, names, nlp)

    # Persist the (updated) coreference cache.
    with open(htmls_coref_cache_fname, 'wb') as f_out:
        pickle.dump(htmls_lookup_coref, f_out, pickle.HIGHEST_PROTOCOL)
    logger.info('Finished')
import yaml
from box import Box
from learning_sizes_evaluation.evaluate import coverage_accuracy_relational, RelationalResult
from logging_setup_dla.logging import set_up_root_logger
from matplotlib import pyplot as plt, colors, cm
from matplotlib.scale import SymmetricalLogTransform
from scipy import stats
from scipy.stats import pearsonr, spearmanr
from sklearn.linear_model import Ridge
from visual_size_comparison.config import VisualConfig
from visual_size_comparison.propagation import build_cooccurrence_graph, Pair, VisualPropagation
from breds.breds_inference import find_similar_words, BackoffSettings, comparison_dev_set
from breds.config import Config, load_word2vec

# Timestamped root logger for this run; logs go to ./logs.
set_up_root_logger(f'INFERENCE_VISUAL_{datetime.now().strftime("%d%m%Y%H%M%S")}', os.path.join(os.getcwd(), 'logs'))
logger = logging.getLogger(__name__)


def main():
    # Load the experiment configuration into an attribute-accessible Box.
    with open("config.yml", "r") as ymlfile:
        cfg = Box(yaml.safe_load(ymlfile))
        # cfg = Box(yaml.safe_load(ymlfile), default_box=True, default_box_attr=None)

    # Dev-set comparison pairs plus the unseen objects to predict for.
    test_pairs, unseen_objects = comparison_dev_set(cfg)
    unseen_objects = [o.replace('_', " ") for o in unseen_objects]

    # TODO check whether the objects aren't in the bootstrapped objects
    visual_config = VisualConfig(cfg.path.vg_objects, cfg.path.vg_objects_anchors)
    config = Config(cfg, visual_config)
    # NOTE(review): main() continues beyond this chunk; remainder not visible here.
from size_comparisons.inference.baseline_numeric_gaussians import find_confidences_for_pairs_lazy, \
    BaselineNumericGaussians
from size_comparisons.parse_objects import InputsParser
from size_comparisons.scraping.compilation import fill_dataframe
import logging
from datetime import datetime
from logging_setup_dla.logging import set_up_root_logger
import os

# Timestamped root logger for this run.
set_up_root_logger(f'SEEDLAZY_{datetime.now().strftime("%d%m%Y%H%M%S")}', os.getcwd())
logger = logging.getLogger(__name__)


def main():
    """Compute lazy pairwise confidences for the numeric-gaussians baseline."""
    inputs = InputsParser()
    labels = inputs.retrieve_labels()
    names = inputs.retrieve_names()
    frame = fill_dataframe(names, labels)

    # Materialize the test pairs as 'TestPair' named tuples (index dropped).
    pairs_df = inputs.retrieve_test_pairs()
    pair_records = list(pairs_df.itertuples(name='TestPair', index=False))

    find_confidences_for_pairs_lazy(frame, pair_records)


if __name__ == "__main__":
    try:
        main()
    except Exception:
        logger.exception("Unhandled exception")
from box import Box
import numpy as np
from logging_setup_dla.logging import set_up_root_logger
from pandas import DataFrame
from sklearn.ensemble import IsolationForest
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from visual_size_comparison.config import VisualConfig
from visual_size_comparison.propagation import build_cooccurrence_graph, Pair, VisualPropagation
from breds.config import Config
from sklearn.svm import SVC, LinearSVC
from matplotlib import pyplot as plt
from learning_sizes_evaluation.evaluate import precision_recall, range_distance, Result

# Timestamped root logger for this run; logs go to ./logs.
set_up_root_logger(f'RANGES_{datetime.now().strftime("%d%m%Y%H%M%S")}', os.path.join(os.getcwd(), 'logs'))
logger = logging.getLogger(__name__)


def iterativily_find_size(lower_bounds_sizes, upper_bounds_sizes):
    # Work on copies so the caller's lists are not mutated.
    l = lower_bounds_sizes.copy()
    u = upper_bounds_sizes.copy()
    total_objects = len(u) + len(l)
    # Fraction of all bounds that are lower bounds; used below as the
    # probability of discarding from the lower-bound side each iteration.
    size_scale = len(l) / total_objects
    logger.info(f'Scale: {size_scale}')
    count_l = 0
    count_r = 0
    # Randomly discard conflicting extremes while the largest lower bound
    # still exceeds the smallest upper bound (i.e. the ranges disagree).
    while len(u) >= 1 and len(l) >= 1 and max(l) > min(u):
        r = random.random()
        if r < size_scale:
            l.remove(max(l))
            # NOTE(review): function continues beyond this chunk; remainder not visible.
import fileinput
import logging
import os
from collections import namedtuple
from math import ceil, floor
import pandas as pd
from logging_setup_dla.logging import set_up_root_logger
from matplotlib.scale import SymmetricalLogTransform
from matplotlib import pyplot as plt
import numpy as np

set_up_root_logger('ANALYZETEST', os.path.join(os.getcwd(), 'logs'))
logger = logging.getLogger(__name__)


def main():
    # Load each annotator's test annotations and index them by object name
    # (drop=False keeps 'object' available as a regular column too).
    anouk: pd.DataFrame = pd.read_csv('data_numeric/VG_YOLO_intersection_test_annotated_anouk.csv')
    anouk: pd.DataFrame = anouk.astype({'object': str})
    anouk.set_index(['object'], inplace=True, drop=False)
    bram: pd.DataFrame = pd.read_csv('data_numeric/VG_YOLO_intersection_test_annotated_bram.csv')
    bram: pd.DataFrame = bram.astype({'object': str})
    bram.set_index(['object'], inplace=True, drop=False)
    # Both annotation files must have the same number of columns.
    assert len(anouk.keys()) == len(bram.keys())
    # Words each annotator flagged in their "hard words" list (lower-cased).
    bram_no_size = [line.strip().lower() for line in fileinput.input('data_numeric/hard_words_bram.txt')]
    anouk_no_size = [line.strip().lower() for line in fileinput.input('data_numeric/hard_words_anouk.txt')]
    remove = ['snow', 'architecture', 'toilet water']
    Result = namedtuple('Result', ['object', 'min', 'max'])
    results = list()
    # NOTE(review): main() continues beyond this chunk; remainder not visible.
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm
from logging_setup_dla.logging import set_up_root_logger
from matplotlib import colors, cm
from scipy import stats
from size_comparisons.exploration.explore_infoboxes import Record, search_infoboxes
from size_comparisons.parse_objects import InputsParser
from size_comparisons.scraping.analyze import retrieve_synset

# NOTE(review): presumably toggles a sampled run — usage not visible in this chunk.
SAMPLE = False

set_up_root_logger('INFOBOXES', os.getcwd())
logger = logging.getLogger(__name__)


def main():
    # data = pd.read_csv('D:\GitHubD\size-comparisons\data\manually_selected.csv')
    # objects = data['object']
    inputparser = InputsParser()
    names = inputparser.retrieve_names()
    # names = [line.strip() for line in fileinput.input('D:\GitHubD\size-comparisons\examples\exploration\VisualGenome_REFORMAT.txt')]
    labels = inputparser.retrieve_labels()
    fname_records = 'records.pkl'
    # Reuse cached records if present; otherwise build one Record per name/label pair.
    if os.path.exists(fname_records):
        with open(fname_records, 'rb') as f:
            records = pickle.load(f)
    else:
        records = [Record(name, labels[i]) for i, name in enumerate(names)]
        # NOTE(review): function continues beyond this chunk; remainder not visible.
from size_comparisons.parse_objects import InputsParser
from size_comparisons.scraping.analyze import retrieve_synset
import logging
from datetime import datetime
from logging_setup_dla.logging import set_up_root_logger
import os

set_up_root_logger(f'ERRORANALYSIS_{datetime.now().strftime("%d%m%Y%H%M%S")}', os.getcwd())
logger = logging.getLogger(__name__)

# When True, also load (and later dump) the scraped HTML for the analyzed terms.
PRINT_HTML = False
# Term ids to analyze; passed to retrieve_synset, so presumably WordNet
# synset identifiers — TODO confirm against retrieve_synset.
analyzed_terms = ['n01581984']


def main():
    input_parser = InputsParser()
    if PRINT_HTML:
        htmls = input_parser.retrieve_google_results_html()
    # Resolve each term to its synset name (note: reads the private _name attribute).
    synset_names = [retrieve_synset(label)._name for label in analyzed_terms]
    regex_sizes = input_parser.retrieve_regex_scraper_sizes()
    regex_contexts = input_parser.retrieve_regex_scraper_contexts()
    for i, term in enumerate(analyzed_terms):
        logger.info(term)
        logger.info(synset_names[i])
        if PRINT_HTML:
            with open('htmls.txt', 'w') as f:
                # NOTE(review): body continues beyond this chunk; remainder not visible.
import pickle
from size_comparisons.inference.baseline_numeric_gaussians import BaselineNumericGaussians
from size_comparisons.parse_objects import InputsParser
from size_comparisons.scraping.compilation import fill_dataframe
import logging
from datetime import datetime
from logging_setup_dla.logging import set_up_root_logger
import os

set_up_root_logger(f'NUMERIC_SEED_{datetime.now().strftime("%d%m%Y%H%M%S")}', os.getcwd())
logger = logging.getLogger(__name__)

# Object subset used when the filtering in main() is re-enabled.
selected = [
    'tiger',
    'insect',
    'ocean',
    'cat',
    'dog',
    'crown',
    'neuropteron',
    'diving suit',
    'light-emitting diode',
    'stone'
]


def main():
    """Build the numeric-gaussians baseline from the object data and pickle it."""
    input_parser = InputsParser()
    labels = input_parser.retrieve_labels()
    names = input_parser.retrieve_names()
    data = fill_dataframe(names, labels)
    # mask = data['name'].isin(selected)
    # data = data[mask]
    baseline = BaselineNumericGaussians(data)
    baseline.fill_adjacency_matrix()
    # BUG FIX: the file was previously opened with mode 'rb' (read-only), which
    # makes pickle.dump fail at runtime ("file must have a 'write' attribute"),
    # and the handle was never closed. Open for binary writing via a context
    # manager instead.
    with open(input_parser.data_dir / 'baseline.p', 'wb') as f:
        pickle.dump(baseline, f)
import logging
import os

import pandas as pd
import tqdm
from logging_setup_dla.logging import set_up_root_logger
from sklearn.metrics import precision_score, recall_score

from reject_abstract_objects.reject import check_abstract

set_up_root_logger(f'reject', os.path.join(os.getcwd(), 'logs'))
logger = logging.getLogger(__name__)

# Forwarded to check_abstract(..., the=THE); exact semantics defined in the
# reject module — presumably whether to prefix entities with "the". TODO confirm.
THE = True


def main():
    """Run an example of the abstract word rejection system."""
    input = pd.read_csv('test_annotated.csv')
    # Keep only annotated rows (-1 appears to mark "not annotated" — confirm).
    input = input[input['abstract'] != -1]
    logger.info(f"Percentage abstract: {input['abstract'].mean()}")
    y_true = list(input['abstract'])
    y_pred = list()
    no_results = set()
    # Predict abstractness per entity, tracking entities with no lookup result.
    for entity in tqdm.tqdm(list(input['object'])):
        res, no_result = check_abstract(entity, the=THE)
        y_pred.append(res)
        if no_result:
            # NOTE(review): loop body continues beyond this chunk; remainder not visible.
import json
from argparse import ArgumentParser
from size_comparisons.parse_objects import InputsParser
from size_comparisons.scraping.frequencies_wikipedia import find_frequencies_wikipedia
import logging
from datetime import datetime
from logging_setup_dla.logging import set_up_root_logger
import os

# Timestamped root logger for this run.
set_up_root_logger(f'FREQSWIKI_{datetime.now().strftime("%d%m%Y%H%M%S")}', os.getcwd())
logger = logging.getLogger(__name__)


def main():
    """Retrieve per-object frequencies from a wikipedia Lucene index and save them as JSON."""
    cli = ArgumentParser()
    cli.add_argument('--index', type=str, required=True)
    parsed = cli.parse_args()
    lucene_dir = parsed.index

    inputs = InputsParser()
    object_names = inputs.retrieve_names()
    out_path = inputs.data_dir / 'frequencies_wikipedia.json'

    # Query the index for every object name, then persist the result mapping.
    frequencies = find_frequencies_wikipedia(object_names, lucene_dir)
    with open(out_path, 'w') as out_file:
        json.dump(frequencies, out_file)
import logging
import os
import pickle
from logging_setup_dla.logging import set_up_root_logger
from nltk import tokenize
import numpy as np

set_up_root_logger(f'HTMLSSTATS', os.path.join(os.getcwd(), 'logs'))
logger = logging.getLogger(__name__)


def main():
    # Pickled dict whose values are iterables of HTML strings (per the loop below).
    with open('htmls.pkl', "rb") as f_html:
        results: dict = pickle.load(f_html)
    sizes = []
    # Collect the token count of every document across all entries.
    for htmls in results.values():
        for html in htmls:
            words = tokenize.word_tokenize(html)
            sizes.append(len(words))
    logger.info(f'Mean doc size: {np.mean(sizes)}')
    logger.info(f'Median doc size: {np.median(sizes)}')


if __name__ == "__main__":
    try:
        main()
    except Exception:
        # NOTE(review): handler body truncated in this view; not visible here.
import os
import pickle
import random

import numpy as np
import pandas as pd
import yaml
from box import Box
from learning_sizes_evaluation.evaluate import coverage_accuracy_relational, RelationalResult
from learning_sizes_evaluation.monte_carlo_permutation_test import permutation_test
from logging_setup_dla.logging import set_up_root_logger

from breds.breds_inference import comparison_dev_set

set_up_root_logger(f'COMBINE', os.path.join(os.getcwd(), 'logs'))
logger = logging.getLogger(__name__)


def get_result(golds, preds, tag, notes):
    # Score relational predictions against gold labels, log both metrics,
    # and wrap them in a RelationalResult labelled with `tag`.
    coverage, selectivity = coverage_accuracy_relational(golds, preds, notes=notes)
    logger.info(f'Coverage: {coverage}')
    logger.info(f'selectivity: {selectivity}')
    return RelationalResult(tag, selectivity, coverage)


def random_combination(visual, linguistic):
    """Use a random combination of the linguistic and visual system."""
    # NOTE(review): function body continues beyond this chunk; not visible here.