Example #1
0
    def from_params(cls, params: Params) -> 'QaSrlReader':
        token_idxr_params = params.pop(
            'token_indexers',
            Params({"tokens": {
                "type": "single_id",
                "lowercase_tokens": True
            }}))
        token_indexers = {
            idxr_name: TokenIndexer.from_params(idxr_params)
            for idxr_name, idxr_params in token_idxr_params.items()
        }
        has_provinence = params.pop("has_provinence", False)
        bio_labels = params.pop("bio_labels", False)

        min_answers = params.pop("min_answers", 3)
        min_valid_answers = params.pop("min_valid_answers", 3)

        question_sources = params.pop("question_sources", None)

        params.assert_empty(cls.__name__)
        return QaSrlReader(token_indexers=token_indexers,
                           has_provinence=has_provinence,
                           bio_labels=bio_labels,
                           min_answers=min_answers,
                           min_valid_answers=min_valid_answers,
                           question_sources=question_sources)
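A minimal usage sketch for the method above, assuming AllenNLP's Params and that QaSrlReader is importable; the keys and values simply mirror the pops and defaults in from_params.

from allennlp.common import Params

# Illustrative config mirroring the keys popped in from_params above.
reader_params = Params({
    "token_indexers": {
        "tokens": {"type": "single_id", "lowercase_tokens": True}
    },
    "has_provinence": False,
    "bio_labels": False,
    "min_answers": 3,
    "min_valid_answers": 3,
    "question_sources": None,
})
# reader = QaSrlReader.from_params(reader_params)  # hypothetical call site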
Example #2
0
    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 entity_indexer: TokenIndexer = TokenIndexer.from_params(
                     Params(INDEXER_DEFAULT)),
                 granularity: str = "sentence",
                 mention_generator: MentionGenerator = None,
                 should_remap_span_indices: bool = True,
                 entity_disambiguation_only: bool = False,
                 extra_candidate_generators: Dict[str,
                                                  MentionGenerator] = None):

        lazy = False
        super().__init__(lazy)
        self.token_indexers = token_indexers or {
            "token": SingleIdTokenIndexer("token")
        }
        self.entity_indexer = {"ids": entity_indexer}
        self.separator = {"*NL*"}
        if granularity == "sentence":
            self.separator.add(".")

        if granularity not in {"sentence", "paragraph"}:
            raise ConfigurationError(
                "Valid arguments for granularity are 'sentence' or 'paragraph'."
            )

        self.entity_disambiguation_only = entity_disambiguation_only
        self.mention_generator = mention_generator or WikiCandidateMentionGenerator(
        )
        self.should_remap_span_indices = should_remap_span_indices

        self.extra_candidate_generators = extra_candidate_generators
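A small construction sketch for the __init__ above; the snippet does not show the enclosing class, so EntityLinkingReader is a purely illustrative name, and only the arguments shown deviate from the defaults.

# EntityLinkingReader stands in for the (unshown) enclosing reader class.
reader = EntityLinkingReader(granularity="paragraph",
                             should_remap_span_indices=False)

# With no arguments the defaults above apply: a single_id "token" indexer,
# sentence granularity (so "." joins the separator set), and a
# WikiCandidateMentionGenerator for candidate mentions.
default_reader = EntityLinkingReader()

# Any granularity other than "sentence" or "paragraph" raises ConfigurationError.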
Example #3
0
    def from_params(cls, params: Params) -> 'CsvClassificationReader':
        tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
        input = params.pop('pos_input', None)
        gold_label = params.pop('pos_gold_label', None)
        skip_header = params.pop('skip_header', True)
        delimiter = params.pop('delimiter', None)
        token_indexers = TokenIndexer.from_params(
            params.pop('token_indexers', None))
        params.assert_empty(cls.__name__)
        return CsvClassificationReader(tokenizer=tokenizer,
                                       token_indexers=token_indexers,
                                       skip_header=skip_header,
                                       delimiter=delimiter,
                                       input=input,
                                       gold_label=gold_label)
Example #4
0
    def custom_dict_from_params(
            params: Params) -> 'Dict[str, TokenIndexer]':  # type: ignore
        """
        We typically use ``TokenIndexers`` in a dictionary, with each ``TokenIndexer`` getting a
        name.  The specification for this in a ``Params`` object is typically ``{"name" ->
        {indexer_params}}``.  This method reads that whole set of parameters and returns a
        dictionary suitable for use in a ``TextField``.

        Because default values for token indexers are typically handled in the calling class to
        this and are based on checking for ``None``, if there were no parameters specifying any
        token indexers in the given ``params``, we return ``None`` instead of an empty dictionary.
        """
        token_indexers = {}
        for name, indexer_params in params.items():
            token_indexers[name] = TokenIndexer.from_params(indexer_params)
        if token_indexers == {}:
            token_indexers = None
        return token_indexers
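A brief usage sketch of the helper above, assuming it is reachable as a plain function; the spec follows the {"name": {indexer_params}} convention described in the docstring, and an empty Params yields None.

from allennlp.common import Params

# Two named indexers in the {"name": {indexer_params}} form described above.
spec = Params({
    "tokens": {"type": "single_id", "lowercase_tokens": True},
    "token_characters": {"type": "characters"},
})
indexers = custom_dict_from_params(spec)             # {"tokens": ..., "token_characters": ...}
no_indexers = custom_dict_from_params(Params({}))    # None, so callers can fall back to their own default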
Example #5
0
def load_model(model_name="conll_full_elmo"):
    """
    Load both the vocabulary and the model and create an instance of the
    full HMTL model.
    """
    if model_name not in [
            "conll_small_elmo", "conll_medium_elmo", "conll_full_elmo"
    ]:
        raise ValueError(f"{model_name} is not a valid name of model.")
    serialization_dir = "model_dumps" + "/" + model_name
    params = Params.from_file(
        params_file=os.path.join(serialization_dir, "config.json"))

    # Load TokenIndexer
    task_keys = [key for key in params.keys() if re.search("^task_", key)]
    token_indexer_params = params.pop(task_keys[-1]).pop("data_params").pop(
        "dataset_reader").pop("token_indexers")
    # see https://github.com/allenai/allennlp/issues/181 for better syntax
    token_indexers = {}
    for name, indexer_params in token_indexer_params.items():
        token_indexers[name] = TokenIndexer.from_params(indexer_params)

    # Load the vocabulary
    logger.info("Loading Vocavulary from %s",
                os.path.join(serialization_dir, "vocabulary"))
    vocab = Vocabulary.from_files(os.path.join(serialization_dir,
                                               "vocabulary"))
    logger.info("Vocabulary loaded")

    # Create model and load weights
    model_params = params.pop("model")
    model = Model.from_params(vocab=vocab,
                              params=model_params,
                              regularizer=None)
    model_state_path = os.path.join(serialization_dir, "weights.th")
    model_state = torch.load(model_state_path, map_location="cpu")
    model.load_state_dict(state_dict=model_state)

    return model, vocab, token_indexers
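A minimal call sketch for load_model above; it assumes the model_dumps/conll_full_elmo dump is available locally.

# Loads the full-ELMo HMTL model together with its vocabulary and token indexers.
model, vocab, token_indexers = load_model("conll_full_elmo")
model.eval()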
Example #6
0
if split not in ['train', 'test', 'valid']:
    raise Exception('w0t?')

if split == 'test':
    base_path += 'test/'
elif split == 'valid':
    base_path += 'valid/'

articles = db.articles.find({
    'split': split
}, projection=['_id']).sort('_id', pymongo.ASCENDING)

tokenizer = Tokenizer.from_params(
    config.get('dataset_reader').get('tokenizer'))
indexer_params = config.get('dataset_reader').get('token_indexers')
token_indexer = {
    k: TokenIndexer.from_params(p)
    for k, p in indexer_params.items()
}
vocab = Vocabulary.from_params(config.get('vocabulary'))

roberta = torch.hub.load('pytorch/fairseq:2f7e3f3323', 'roberta.large')
roberta.eval()

ids = np.load(base_path + '_ids.npy')
#ids = np.load("/a/home/cc/students/cs/shlomotannor/nlp_course/newscaptioning/_missing_mask.npy")

if reverse:
    ids = ids[::-1]

#['_id', 'web_url', 'snippet', 'lead_paragraph', 'abstract', 'print_page', 'blog', 'source', 'multimedia', 'headline', \
# 'keywords', 'pub_date', 'document_type', 'news_desk', 'section_name', 'subsection_name', 'byline',\
Example #7
0
    def initialize(self):
        # We need to initialize the model inside self.run and not self.__init__
        # to ensure that the model loads in the correct thread.
        config_path = 'expt/nytimes/9_transformer_objects/config.yaml'
        logger.info(f'Loading config from {config_path}')
        config = yaml_to_params(config_path, overrides='')
        prepare_environment(config)
        vocab = Vocabulary.from_params(config.pop('vocabulary'))
        model = Model.from_params(vocab=vocab, params=config.pop('model'))
        model = model.eval()

        model_path = 'expt/nytimes/9_transformer_objects/serialization/best.th'
        logger.info(f'Loading best model from {model_path}')
        best_model_state = torch.load(model_path,
                                      map_location=torch.device('cpu'))
        model.load_state_dict(best_model_state)

        self.model = model.to(self.device)

        logger.info('Loading roberta model.')
        roberta = torch.hub.load('pytorch/fairseq:2f7e3f3323', 'roberta.base')
        self.bpe = roberta.bpe
        self.indices = roberta.task.source_dictionary.indices

        logger.info('Loading face detection model.')
        self.mtcnn = MTCNN(keep_all=True, device=self.device)
        self.inception = InceptionResnetV1(pretrained='vggface2').eval()

        self.resnet = resnet152()
        self.resnet = self.resnet.to(self.device).eval()

        cfg = 'tell/yolov3/cfg/yolov3-spp.cfg'
        weight_path = 'data/yolov3-spp-ultralytics.pt'
        self.darknet = Darknet(cfg, img_size=416)
        attempt_download(weight_path)
        self.darknet.load_state_dict(
            torch.load(weight_path, map_location=self.device)['model'])
        self.darknet.to(self.device).eval()

        # Get names and colors
        self.names = load_classes('tell/yolov3/data/coco.names')
        random.seed(123)
        self.colors = [[random.randint(0, 255) for _ in range(3)]
                       for _ in range(len(self.names))]

        self.preprocess = Compose([
            Resize(256),
            CenterCrop(224),
            ToTensor(),
            Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

        data_iterator = BasicIterator(batch_size=4)
        data_iterator.index_with(model.vocab)
        self.data_iterator = data_iterator

        self.tokenizer = Tokenizer.from_params(
            config.get('dataset_reader').get('tokenizer'))

        indexer_params = config.get('dataset_reader').get('token_indexers')

        self.token_indexers = {
            k: TokenIndexer.from_params(p)
            for k, p in indexer_params.items()
        }
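A short sketch of how the pieces wired up in initialize might be exercised afterwards; processor stands in for an instance of the enclosing class, and the image path and caption are hypothetical.

from PIL import Image

# Assumes `processor` is an instance of this class after initialize() has run.
image = Image.open('example.jpg').convert('RGB')           # hypothetical image path
image_tensor = processor.preprocess(image).unsqueeze(0)    # (1, 3, 224, 224) after the Compose above

caption_tokens = processor.tokenizer.tokenize('A hypothetical caption.')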
Example #8
0
    
    ### Load Vocabulary from files ###
    logging.info("Loading Vocavulary from %s", os.path.join(serialization_dir, "vocabulary"))
    vocab = Vocabulary.from_files(os.path.join(args.serialization_dir, "vocabulary"))
    logger.info("Vocabulary loaded")
    
    ### Create model ###
    model_params = params.pop("model")
    model = Model.from_params(vocab=vocab, params=model_params, regularizer=None)
    best_model_state_path = os.path.join(serialization_dir, "best_{}.th".format(args.task))
    best_model_state = torch.load(best_model_state_path)
    model.load_state_dict(state_dict=best_model_state)
    
    ### Create token indexer ###
    token_index = {}
    task_keys = [key for key in params.keys() if re.search("^task_", key)] 
    token_indexer_params = params.pop(task_keys[-1]).pop("data_params").pop("dataset_reader").pop("token_indexers")
    for name, indexer_params in token_indexer_params.items(): 
        token_index[name] = TokenIndexer.from_params(indexer_params) 
    
    params_senteval['encoder'] = model
    
    se = senteval.engine.SE(params_senteval, batcher, prepare)
    transfer_tasks = ['Length', 'WordContent', 'Depth', 'TopConstituents',
                      'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
                      'OddManOut', 'CoordinationInversion']
    results = se.eval(transfer_tasks)
    
    print(results)
    logging.info("SentEval(uation) Finished")
Example #9
0
def main():
    base_path = "/a/home/cc/students/cs/shlomotannor/nlp_course/newscaptioning/dbr/"
    agentsnpy = 'dbr/_agents.npy'

    config_path = "expt/nytimes/BM2/config.yaml"
    config = yaml_to_params(config_path, overrides='')

    host = 'nova'
    hosti = [i.startswith('host') for i in sys.argv]
    if True in hosti:
        print(f'using host {host}')
        host = sys.argv[hosti.index(True)].split("_")[1]

    client = pymongo.MongoClient(host=host, port=27017)
    db = client.nytimes

    split = 'train'
    reverse = False
    full_search = False
    rand = False
    offset = False
    agent = False
    prints = False

    if 'create_agents' in sys.argv:
        free = np.arange(101)
        np.save(agentsnpy, free)
        return

    if 'train' in sys.argv:
        split = 'train'

    if 'test' in sys.argv:
        split = 'test'

    if 'valid' in sys.argv:
        split = 'valid'

    if 'r' in sys.argv:
        reverse = True

    if 'fs' in sys.argv:
        full_search = True

    if 'rand' in sys.argv:
        rand = True
        full_search = True

    if 'print' in sys.argv:
        prints = True

    offseti = [i.startswith('offset') for i in sys.argv]
    if True in offseti:
        offseti = offseti.index(True)
        offset = True
        offseti = sys.argv[offseti][len("offset"):]
        offsetend = sys.maxsize

        if "_" in offseti:
            offseti, offsetend = offseti.split("_")
            offsetend = int(offsetend)

        offseti = int(offseti)

        print('offset', offseti, offsetend)

    agenti = [i.startswith('agent') for i in sys.argv]
    if True in agenti:
        agenti = agenti.index(True)
        agent = True
        agenti = sys.argv[agenti][len("agent"):]
        if agenti == 'r':
            free = np.load(agentsnpy)
            if free.size > 0:
                agenti = np.random.choice(free)
                free = free[~np.isin(free, agenti)]
                np.save(agentsnpy, free)
            else:
                raise Exception("no more free agent indices")

        else:
            agenti = int(agenti)

    if split not in ['train', 'test', 'valid']:
        raise Exception('w0t?')

    if split == 'test':
        base_path += 'test/'
    elif split == 'valid':
        base_path += 'valid/'

    tokenizer = Tokenizer.from_params(
        config.get('dataset_reader').get('tokenizer'))
    indexer_params = config.get('dataset_reader').get('token_indexers')
    token_indexer = {
        k: TokenIndexer.from_params(p)
        for k, p in indexer_params.items()
    }
    vocab = Vocabulary.from_params(config.get('vocabulary'))

    roberta = torch.hub.load('pytorch/fairseq:2f7e3f3323', 'roberta.large')
    roberta.eval()

    if agent:
        agent_ids = 2000
        ids = np.load(base_path + '_ids_missing.npy')
        ids = ids[agenti * agent_ids:(agenti + 1) * agent_ids]

    else:
        ids = np.load(base_path + '_ids.npy')

    if reverse:
        ids = ids[::-1]

    if rand:
        np.random.shuffle(ids)

    if offset:
        ids = ids[offseti:offsetend]

    # ['_id', 'web_url', 'snippet', 'lead_paragraph', 'abstract', 'print_page', 'blog', 'source', 'multimedia', 'headline', \
    # 'keywords', 'pub_date', 'document_type', 'news_desk', 'section_name', 'subsection_name', 'byline',\
    # 'type_of_material', 'word_count', 'slideshow_credits', 'scraped', 'parsed', 'error', 'image_positions',\
    # 'parsed_section', 'n_images', 'language', 'parts_of_speech', 'n_images_with_faces', 'detected_face_positions', 'split']

    projection = ['_id', 'parsed_section']

    sfrom = True
    l = [os.path.basename(i) for i in glob(base_path + "*")]

    if prints:
        ids = tqdm(ids)

    for aid in ids:
        if not agent:
            if full_search:
                l = [os.path.basename(id) for id in glob(base_path + "*[!m]")]
                if aid in l:
                    continue
            else:
                if sfrom:
                    if aid in l:
                        continue
                    else:
                        sfrom = False
                        del l

        a = db.articles.find_one({'_id': {'$eq': aid}}, projection=projection)
        sections = a['parsed_section']
        paragraphs = [p for p in sections if p['type'] == 'paragraph']
        if not len(paragraphs):
            continue
        tokens = [tokenizer.tokenize(c['text']) for c in paragraphs]
        context = ListTextField([TextField(p, token_indexer) for p in tokens])
        context.index(vocab)
        context = context.as_tensor(context.get_padding_lengths())
        r = roberta.extract_features(context['roberta']).detach()

        torch.save(r, base_path + aid)
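A brief follow-up sketch, assuming the tensor saved for an article by the loop above is later read back; the id and directory shown are hypothetical.

# Hypothetical read-back of one saved RoBERTa feature tensor (ids double as file names).
saved_path = os.path.join('dbr', '58e29f9c7c459f24986d9f52')   # example article _id
features = torch.load(saved_path, map_location='cpu')
print(features.shape)   # (num_paragraphs, max_tokens, 1024) for roberta.large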