Example #1
def get_envs(env_files=None):
    dataset = []
    if env_files is None:
        fns = [get_train_shard_path(i) for i in range(0, 30)]
    else:
        fns = env_files

    for fn in fns:
        dataset += load_jsonl(fn)
    tables = load_jsonl(table_file)

    table_dict = {table['name']: table for table in tables}

    # Load pretrained embeddings.
    embedding_model = word_embeddings.EmbeddingModel(vocab_file, embedding_file)

    with open(en_vocab_file, 'r') as f:
        vocab = json.load(f)
    en_vocab = data_utils.Vocab([])
    en_vocab.load_vocab(vocab)

    # Create environments.
    envs = create_envs(table_dict, dataset, en_vocab, embedding_model)

    return envs
Example #2
def run(self):
    # TODO: don't read it all into memory
    movie_data = load_jsonl(self.input()[0])
    tv_data = load_jsonl(self.input()[1])
    video_data = self._filter(movie_data + tv_data)
    video_data = self._normalize(video_data)
    with self.output().open('w') as fp:
        for obj in video_data:
            line = '{}\n'.format(json.dumps(obj))
            fp.write(line)
Example #3
def get_infobox_data_multi(in_file, out_file):
    """ Same as get_infobox_data - only utilizing multiprocessing.
    Args
        :param in_file: File from where to take the titles and texts for crawling and mapping.
        :param out_file: Filename/path where to save the created mapped texts and triples.
    Results
        :return: None
    """
    data = utils.load_jsonl(in_file)
    # Split the loaded data into N_CORES roughly equal chunks.
    data_dict = utils.split_dict_equally(data, N_CORES)
    dict_length = len(data_dict)
    pool = Pool(N_CORES)
    ts = time()
    with open(out_file, 'a', encoding="utf8") as outfile:
        for idx, dlist in enumerate(data_dict):
            if idx == dict_length - 1:
                print("stop")
            dict_list = utils.split_dict_equally(dlist, N_CORES)
            for train_exls in pool.map(process_batches, dict_list):
                if len(train_exls) > 0:
                    for train_exl in train_exls:
                        json.dump(train_exl, outfile)
                        outfile.write('\n')
    pool.close()
    pool.join()
    duration = (time() - ts) / 60
    print(str(duration) + " min")
    utils.count_json_lines(out_file)
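
A minimal, hypothetical invocation of the function above; the file names are placeholders, not paths from the original project:

# Hypothetical usage sketch: read titles/texts from one JSONL file and append
# the mapped texts and triples to another. File names are illustrative only.
if __name__ == '__main__':
    get_infobox_data_multi('articles.jsonl', 'infobox_triples.jsonl')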
Example #4
    def _read(self, file_path) -> Iterator[Instance]:
        # TODO: read in the data
        data = load_jsonl(file_path)
        #suffix = file_path.split("_")[-1].split('.')[0]

        label_maps = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
        data_amount_limit_map = {'train': 600000, 'dev': 100000, 'test': 100000}
        data_limit = 0
        for key, v in data_amount_limit_map.items():
            if (file_path.find(key) != -1):
                data_limit = data_amount_limit_map[key]
                break
        for data_idx, item in enumerate(data):
            if (data_idx > data_limit):
                break
            # Read in the claim fact, clue fact, true candidates, and false candidates.
            left_sent = item['sentence1']
            right_sent = item['sentence2']


            if item["gold_label"] not in label_maps.keys():
                continue

            label = label_maps[item["gold_label"]]

            left_input_tokens = self.tokenizer(str(left_sent))
            right_input_tokens = self.tokenizer(str(right_sent))

            if (len(left_input_tokens) < self.min_seq_len or len(right_input_tokens) < self.min_seq_len):
                continue

            yield self.text_to_instances(
                left_input_tokens,
                right_input_tokens,
                label
            )
Example #5
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import array as arr
from sklearn import metrics
import svm
import utils

countH = 0
countL = 0
lista_opt = []
call = []
lea = []
xor = []
listcount = []

test = utils.load_jsonl('test_dataset_blind.jsonl')

dimension = 0

for current in test:
    # print(json.dumps(current, indent=2))
    instructions = current['instructions']

    # print("current instructions list")
    #print(json.dumps(instructions, indent=2))
    # print(len(instructions), optimizer)
    # print("current opt ", optimizer)
    dimension += 1
    
    call.append(utils.count_appearances('call ', instructions))
    lea.append(utils.count_appearances('lea ', instructions))
Example #6
from utils import load_jsonl, save_jsonl
from urllib.parse import urlparse

# paras = load_jsonl('pc.paragraphs.jsonl')
paras = load_jsonl('blog_kb.paragraphs.jsonl')
print(len(paras))
print(paras[-1])
results = []
out = []
for i, p in enumerate(paras):
    text = p['text'].lower()
    url = p['meta']['url']
    if not any(k in url.lower() for k in ('kb', 'tour', 'faq', 'blog')):
        continue
    out.append(p)
    results.append(url)

print('%d total' % len(results))
results = sorted(set(results))
print('%d unique' % len(results))

for i, url in enumerate(results[:20]):
    o = urlparse(url)
    print('%4d: %s' % (i, o))

# save_jsonl('kb.paragraphs.jsonl', out)
save_jsonl('final.paragraphs.jsonl', out)
Example #7
from utils import load_jsonl  # assumed helper, consistent with the other examples

# `nlp` is assumed to be an already-loaded spaCy pipeline with a text classifier,
# since `doc.cats` is read below.
texts = [
    'Today is sunny',
    'I hate bunnies',
    '''Chris Goult recently joined us as our shiny new channel marketing manager.  Chris comes to us from Konica Minolta Business Solutions Australia, with loads of knowledge and enthusiasm for all things marketing and channel related. ''',
    '''You don't want your precious funds ending up as wasted paper in recycle bins.''',
    '''At the core of PaperCut MF is the ability to interface directly with MFD hardware to track off-the-glass functions such as copy, scan, fax and secure print release. PaperCut has worked directly with leading MFD manufacturers to bring our software directly to the MFD at a firmware level. To complete the solution offering across all devices PaperCut MF also supports hardware copier terminals from multiple vendors. ''',
]
for text in texts:
    print('-' * 80)
    print(text)
    doc = nlp(text)
    print(doc.cats)

print('=' * 80)
paras = load_jsonl('pc.paragraphs.jsonl')
print(len(paras))
print(paras[-1])
results = []
for i, p in enumerate(paras):
    text = p['text']
    url = p['meta']['url']
    doc = nlp(text)
    score = doc.cats
    # print(sorted(score))
    # assert False

    results.append((text, url, score))
    if i % 1000 == 50:
        # text = text.decode('utf-8', errors='ignore')
        text = text.encode('cp850', 'replace').decode('cp850')
Example #8
# @author: FLAVIO LORENZI, mat. 1662963

import json_lines
import json
import utils
import numpy as np

'''This script shows how to work with JSONL and JSON items.'''
# A JSONL file is a list of JSON items, one per line.

data = utils.load_jsonl('partial2.jsonl')

"""prior probability"""
prob_L = utils.get_opt_probability(data, 'L')
prob_H = 1 - prob_L
'''
print("")
print("prior probability LOW", prob_L)
print("prior probability HIGH", prob_H)
print("")
'''
dimension = 0

for current in data:
    # print(json.dumps(current, indent=2))
    [instructions, optimizer] = utils.get_instructions_optimizer(current)
    # print("current instructions list")
    # print(json.dumps(instructions, indent=2))
    # print(len(instructions), optimizer)
    # print("current opt ", optimizer)
    dimension += 1
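
Every example on this page relies on some load_jsonl helper. Its exact implementation differs from project to project, but a minimal sketch, assuming one JSON object per non-empty line, could look like this:

import json

def load_jsonl(path):
    # Minimal illustrative sketch, not the exact helper used in the examples:
    # parse one JSON object per non-empty line and return them as a list.
    items = []
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if line:
                items.append(json.loads(line))
    return items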
Example #9
    # Load corpus
    fn = 'corpus.{}.data'.format(hashlib.md5(args.data.encode()).hexdigest())
    if os.path.exists(fn):
        print('Loading cached dataset...')
        corpus = torch.load(fn)
    else:
        print('Producing dataset...')
        corpus = data.Corpus(args.data)
        torch.save(corpus, fn)

    # Load data
    train_data, val_data, test_data = data_load(corpus)

    # Initiate model
    ntokens = len(corpus.dictionary)
    train_tokens = load_jsonl(args.data + '/' + 'train.jsonl')
    valid_tokens = load_jsonl(args.data + '/' + 'valid.jsonl')
    test_tokens = load_jsonl(args.data + '/' + 'test.jsonl')
    tok2i = build_tok2i(
        list(
            chain.from_iterable(
                [d['tokens'] for d in (train_tokens + valid_tokens)])))
    i2tok = {j: i for i, j in tok2i.items()}

    # Add more attributes to args
    args.n_classes = len(tok2i)

    # # Set decoder config
    decoder_config = {
        'fc_dim': args.fc_dim,
        'dec_lstm_dim': args.dec_lstm_dim,
Example #10
from utils import load_jsonl, save_jsonl

paras1 = load_jsonl('kb.paragraphs.jsonl')
paras2 = load_jsonl('blog.paragraphs.jsonl')

print('kb')
print(len(paras1))
print(paras1[-1])
print('blog')
print(len(paras2))
print(paras2[-1])

out = []
out_text = set()

for paras in (paras1, paras2):
    for i, p in enumerate(paras):
        text = p['text']
        if text in out_text:
            continue
        out_text.add(text)
        out.append(p)

print('%d total' % len(out))
print('%d unique' % len(out_text))

save_jsonl('blog_kb.paragraphs.jsonl', out)
Example #11
def main():
    parser = argparse.ArgumentParser(
        description=
        """Computes rationale and final class classification scores""",
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        '--data_dir',
        dest='data_dir',
        required=True,
        help='Which directory contains a {train,val,test}.jsonl file?')
    parser.add_argument('--split',
                        dest='split',
                        required=True,
                        help='Which of {train,val,test} are we scoring on?')
    parser.add_argument('--strict',
                        dest='strict',
                        required=False,
                        action='store_true',
                        default=False,
                        help='Do we perform strict scoring?')
    parser.add_argument('--results',
                        dest='results',
                        required=True,
                        help="""Results File
    Contents are expected to be jsonl of:
    {
        "annotation_id": str, required
        # these classifications *must not* overlap
        "rationales": List[
            {
                "docid": str, required
                "hard_rationale_predictions": List[{
                    "start_token": int, inclusive, required
                    "end_token": int, exclusive, required
                }], optional,
                # token level classifications, a value must be provided per-token
                # in an ideal world, these correspond to the hard-decoding above.
                "soft_rationale_predictions": List[float], optional.
                # sentence level classifications, a value must be provided for every
                # sentence in each document, or not at all
                "soft_sentence_predictions": List[float], optional.
            }
        ],
        # the classification the model made for the overall classification task
        "classification": str, optional
        # A probability distribution output by the model. We require this to be normalized.
        "classification_scores": Dict[str, float], optional
        # The next two fields are measures for how faithful your model is (the
        # rationales it predicts are in some sense causal of the prediction), and
        # how sufficient they are. We approximate a measure for comprehensiveness by
        # asking that you remove the top k%% of tokens from your documents,
        # running your models again, and reporting the score distribution in the
        # "comprehensiveness_classification_scores" field.
        # We approximate a measure of sufficiency by asking exactly the converse
        # - that you provide model distributions on the removed k%% tokens.
        # 'k' is determined by human rationales, and is documented in our paper.
        # You should determine which of these tokens to remove based on some kind
        # of information about your model: gradient based, attention based, other
        # interpretability measures, etc.
        # scores per class having removed k%% of the data, where k is determined by human comprehensive rationales
        "comprehensiveness_classification_scores": Dict[str, float], optional
        # scores per class having access to only k%% of the data, where k is determined by human comprehensive rationales
        "sufficiency_classification_scores": Dict[str, float], optional
        # the number of tokens required to flip the prediction - see "Is Attention Interpretable" by Serrano and Smith.
        "tokens_to_flip": int, optional
    }
    When providing one of the optional fields, it must be provided for *every* instance.
    The classification, classification_score, and comprehensiveness_classification_scores
    must together be present for every instance or absent for every instance.
    """)
    parser.add_argument('--iou_thresholds',
                        dest='iou_thresholds',
                        required=False,
                        nargs='+',
                        type=float,
                        default=[0.5],
                        help='''Thresholds for IOU scoring.

    These are used for "soft" or partial match scoring of rationale spans.
    A span is considered a match if the size of the intersection of the prediction
    and the annotation, divided by the union of the two spans, is larger than
    the IOU threshold. This score can be computed for arbitrary thresholds.
    ''')
    parser.add_argument('--score_file',
                        dest='score_file',
                        required=False,
                        default=None,
                        help='Where to write results?')
    args = parser.parse_args()
    results = load_jsonl(args.results)
    docids = set(
        chain.from_iterable([rat['docid'] for rat in res['rationales']]
                            for res in results))
    docs = load_flattened_documents(args.data_dir, docids)
    verify_instances(results, docs)
    # load truth
    annotations = annotations_from_jsonl(
        os.path.join(args.data_dir, args.split + '.jsonl'))
    docids |= set(
        chain.from_iterable((ev.docid
                             for ev in chain.from_iterable(ann.evidences))
                            for ann in annotations))

    has_final_predictions = _has_classifications(results)
    scores = dict()
    if args.strict:
        if not args.iou_thresholds:
            raise ValueError(
                "--iou_thresholds must be provided when running strict scoring"
            )
        if not has_final_predictions:
            raise ValueError(
                "We must have a 'classification', 'classification_score', and 'comprehensiveness_classification_score' field in order to perform scoring!"
            )
    # TODO think about offering a sentence level version of these scores.
    if _has_hard_predictions(results):
        truth = list(
            chain.from_iterable(
                Rationale.from_annotation(ann) for ann in annotations))
        pred = list(
            chain.from_iterable(
                Rationale.from_instance(inst) for inst in results))
        if args.iou_thresholds is not None:
            iou_scores = partial_match_score(truth, pred, args.iou_thresholds)
            scores['iou_scores'] = iou_scores
        # NER style scoring
        rationale_level_prf = score_hard_rationale_predictions(truth, pred)
        scores['rationale_prf'] = rationale_level_prf
        token_level_truth = list(
            chain.from_iterable(rat.to_token_level() for rat in truth))
        token_level_pred = list(
            chain.from_iterable(rat.to_token_level() for rat in pred))
        token_level_prf = score_hard_rationale_predictions(
            token_level_truth, token_level_pred)
        scores['token_prf'] = token_level_prf
    else:
        logging.info(
            "No hard predictions detected, skipping rationale scoring")

    if _has_soft_predictions(results):
        flattened_documents = load_flattened_documents(args.data_dir, docids)
        paired_scoring = PositionScoredDocument.from_results(
            results, annotations, flattened_documents, use_tokens=True)
        token_scores = score_soft_tokens(paired_scoring)
        scores['token_soft_metrics'] = token_scores
    else:
        logging.info(
            "No soft predictions detected, skipping rationale scoring")

    if _has_soft_sentence_predictions(results):
        documents = load_documents(args.data_dir, docids)
        paired_scoring = PositionScoredDocument.from_results(results,
                                                             annotations,
                                                             documents,
                                                             use_tokens=False)
        sentence_scores = score_soft_tokens(paired_scoring)
        scores['sentence_soft_metrics'] = sentence_scores
    else:
        logging.info(
            "No sentence level predictions detected, skipping sentence-level diagnostic"
        )

    if has_final_predictions:
        flattened_documents = load_flattened_documents(args.data_dir, docids)
        class_results = score_classifications(results, annotations,
                                              flattened_documents)
        scores['classification_scores'] = class_results
    else:
        logging.info(
            "No classification scores detected, skipping classification")

    pprint.pprint(scores)

    if args.score_file:
        with open(args.score_file, 'w') as of:
            json.dump(scores, of, indent=4, sort_keys=True)
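
For reference, a single line of the --results file described in the help text above might look like the hypothetical entry below (only the required fields plus a classification; all ids, labels, and scores are made up for illustration):

import json

# Hypothetical single entry for the --results JSONL file, following the schema
# documented in the --results help text above. Ids, labels, and scores are
# invented for illustration only.
entry = {
    "annotation_id": "example_0",
    "rationales": [
        {
            "docid": "doc_0",
            "hard_rationale_predictions": [
                {"start_token": 3, "end_token": 7},
            ],
        }
    ],
    "classification": "POS",
    "classification_scores": {"POS": 0.8, "NEG": 0.2},
}

with open("results.jsonl", "a") as f:
    f.write(json.dumps(entry) + "\n")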