Example #1
    def __init__(self, config: "TokenizerConfiguration"):
        self._config = config

        self._end_tokens = config.end_tokens or []
        self._start_tokens = config.start_tokens or []
        # We reverse the tokens here because we're going to insert them with `insert(0)` later;
        # this makes sure they show up in the right order.
        self._start_tokens.reverse()

        self.__nlp__ = get_spacy_model(config.spacy_model,
                                       pos_tags=True,
                                       ner=False,
                                       parse=False)
        if config.segment_sentences and not self.__nlp__.has_pipe(
                self.__SPACY_SENTENCIZER__):
            try:
                self.__nlp__.add_pipe(self.__SPACY_SENTENCIZER__)
            # spacy < 3.0.0
            except ValueError:
                self.__nlp__.add_pipe(
                    self.__nlp__.create_pipe(self.__SPACY_SENTENCIZER__))

        if config.text_cleaning is None:
            self.text_cleaning = TextCleaning()
        else:
            self.text_cleaning = TextCleaning.from_params(
                Params(copy.deepcopy(config.text_cleaning)))
    def __init__(self,
                 model: Model,
                 dataset_reader: DatasetReader,
                 language: str = 'en_core_web_sm') -> None:
        super().__init__(model, dataset_reader)
        self._spacy = get_spacy_model(language,
                                      pos_tags=True,
                                      parse=True,
                                      ner=False)
        self._model._full_output_score = True
        self._beam_size = 1
        self._best_n = 5

        self._spearmanr_wel = Welford()
        self._kendalls_tau_wel = Welford()
        self._pearsonr_wel = Welford()

        self._pmr_correct = 0.0
        self._pmr_total = 0.0

        self._pos_acc_correct = 0.0
        self._pos_acc_total = 0.0

        self._spearmanr_p_values = []
        self._kendalls_tau_p_values = []
        self._pearsonr_p_values = []

        self._exclude_first = True
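
`Welford` is not defined in this excerpt; it presumably implements Welford's online algorithm for accumulating a running mean and variance of the correlation scores. A minimal sketch, with an assumed call-style interface:

class Welford:
    """Sketch of Welford's online mean/variance accumulator (interface assumed)."""

    def __init__(self) -> None:
        self.n = 0
        self.mean = 0.0
        self._m2 = 0.0

    def __call__(self, value: float) -> None:
        # Numerically stable running update.
        self.n += 1
        delta = value - self.mean
        self.mean += delta / self.n
        self._m2 += delta * (value - self.mean)

    @property
    def std(self) -> float:
        return (self._m2 / (self.n - 1)) ** 0.5 if self.n > 1 else 0.0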
    def test_read_from_file_reuters_corpus_and_segments_sentences_properly(
            self, lazy, max_sequence_length):
        reader = MultiLabelTextClassificationJsonReader(
            lazy=lazy,
            segment_sentences=True,
            max_sequence_length=max_sequence_length)
        reuters_path = Path(
            "tests/fixtures") / "data" / "reuters-21578" / "train.jsonl"
        instances = reader.read(reuters_path)
        instances = ensure_list(instances)

        splitter = SpacySentenceSplitter()
        spacy_tokenizer = get_spacy_model("en_core_web_sm", False, False,
                                          False)

        text1 = (
            "U.K. GROWING IMPATIENT WITH JAPAN - THATCHER Prime Minister Margaret Thatcher said the"
            " U.K. Was growing more impatient with Japanese trade barriers and warned that it would"
            " soon have new powers against countries not offering reciprocal access to their"
            " markets.")
        instance1 = {"text": text1, "labels": ["acq", "trade"]}
        text2 = (
            "CANADA OIL EXPORTS RISE 20 PCT IN 1986 Canadian oil exports rose 20 pct in 1986 over"
            " the previous year to 33.96 mln cubic meters, while oil imports soared 25.2 pct to"
            " 20.58 mln cubic meters, Statistics Canada said. Production, meanwhile, was unchanged"
            " from the previous year at 91.09 mln cubic feet.")
        instance2 = {"text": text2, "labels": ["nat-gas", "crude"]}
        text3 = (
            "COFFEE, SUGAR AND COCOA EXCHANGE NAMES CHAIRMAN The New York Coffee, Sugar and Cocoa"
            " Exchange (CSCE) elected former first vice chairman Gerald Clancy to a two-year term"
            " as chairman of the board of managers, replacing previous chairman Howard Katz. Katz,"
            " chairman since 1985, will remain a board member.")
        instance3 = {"text": text3, "labels": ["sugar", "cocoa", "coffee"]}

        for instance in [instance1, instance2, instance3]:
            sentences = splitter.split_sentences(instance["text"])
            tokenized_sentences: List[List[str]] = []
            for sentence in sentences:
                tokens = [token.text for token in spacy_tokenizer(sentence)]
                if max_sequence_length:
                    tokens = tokens[:max_sequence_length]
                tokenized_sentences.append(tokens)
            instance["tokens"] = tokenized_sentences

        assert len(instances) == 3
        fields = instances[0].fields
        text = [[token.text for token in sentence.tokens]
                for sentence in fields["tokens"]]
        assert text == instance1["tokens"]
        assert fields["labels"].labels == instance1["labels"]
        fields = instances[1].fields
        text = [[token.text for token in sentence.tokens]
                for sentence in fields["tokens"]]
        assert text == instance2["tokens"]
        assert fields["labels"].labels == instance2["labels"]
        fields = instances[2].fields
        text = [[token.text for token in sentence.tokens]
                for sentence in fields["tokens"]]
        assert text == instance3["tokens"]
        assert fields["labels"].labels == instance3["labels"]
Example #4
    def __init__(
        self,
        language: str = "en_core_web_sm",
        pos_tags: bool = True,
        parse: bool = False,
        ner: bool = False,
        keep_spacy_tokens: bool = False,
        split_on_spaces: bool = False,
        start_tokens: Optional[List[str]] = None,
        end_tokens: Optional[List[str]] = None,
    ) -> None:
        # Save these for use later in the _to_params method
        self._language = language
        self._pos_tags = pos_tags
        self._parse = parse
        self._ner = ner
        self._split_on_spaces = split_on_spaces

        self.spacy = get_spacy_model(self._language, self._pos_tags, self._parse, self._ner)

        if self._split_on_spaces:
            self.spacy.tokenizer = _WhitespaceSpacyTokenizer(self.spacy.vocab)

        self._keep_spacy_tokens = keep_spacy_tokens
        self._start_tokens = start_tokens or []
        # We reverse the tokens here because we're going to insert them with `insert(0)` later;
        # this makes sure they show up in the right order.
        self._start_tokens.reverse()
        self._is_version_3 = spacy.__version__ >= "3.0"
        self._end_tokens = end_tokens or []
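
The `_WhitespaceSpacyTokenizer` used above (and the similar `WhitespaceTokenizer` in later examples) is not shown in this excerpt. A minimal sketch, assuming it only needs to split on single spaces and hand spaCy a ready-made `Doc`:

from spacy.tokens import Doc


class _WhitespaceSpacyTokenizer:
    """Sketch of a whitespace-only tokenizer that bypasses spaCy's rule-based tokenizer."""

    def __init__(self, vocab) -> None:
        self.vocab = vocab

    def __call__(self, text: str) -> Doc:
        words = text.split(" ")
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)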
Example #5
 def __init__(self,
              language: str = 'en_core_web_sm',
              pos_tags: bool = False,
              parse: bool = False,
              ner: bool = False) -> None:
     self.spacy = get_spacy_model(language, pos_tags, parse, ner)
     # Modify default to use white space tokenizer, defined above
     self.spacy.tokenizer = WhitespaceTokenizer(self.spacy.vocab)
 def __init__(self,
              language: str = 'en_core_web_sm',
              pos_tags: bool = False,
              parse: bool = False,
              ner: bool = False,
              keep_spacy_tokens: bool = False) -> None:
     self.spacy = get_spacy_model(language, pos_tags, parse, ner)
     self._keep_spacy_tokens = keep_spacy_tokens
Example #7
    def __init__(
        self, model: Model, dataset_reader: DatasetReader, language: str = "en_core_web_sm"
    ) -> None:
        super().__init__(model, dataset_reader)

        # We have to use spacy to tokenise our document here, because we need
        # to also know sentence boundaries to propose valid mentions.
        self._spacy = get_spacy_model(language, pos_tags=True, parse=True, ner=False)
    def test_read_from_file_ag_news_corpus_and_segments_sentences_properly(
            self, lazy: bool, label_name: str,
            max_sequence_length: Optional[int]):
        reader = TextSentimentReader(lazy=lazy,
                                     segment_sentences=True,
                                     label_name=label_name,
                                     max_sequence_length=max_sequence_length)
        ag_path = Path(DATA_DIR, 'ag_news_corpus_original.jsonl')
        if label_name == 'text_sentiment':
            ag_path = Path(DATA_DIR, 'ag_news_corpus.jsonl')
        instances = reader.read(ag_path)
        instances = ensure_list(instances)

        splitter = SpacySentenceSplitter()
        spacy_tokenizer = get_spacy_model("en_core_web_sm", False, False,
                                          False)

        text1 = ("Memphis Rout Still Stings for No. 14 Louisville; Coach "
                 "Petrino Vows to Have Team Better Prepared. NASHVILLE, "
                 "Tenn. Nov 3, 2004 - Louisville #39;s 30-point loss "
                 "at home to Memphis last season is still a painful memory "
                 "for the Cardinals.")
        instance1 = {"text": text1, "label": "2"}
        text2 = ("AP - Eli Manning has replaced Kurt Warner as the New York"
                 " Giants' starting quarterback.")
        instance2 = {"text": text2, "label": "2"}
        text3 = ("A conference dedicated to online journalism explores the "
                 "effect blogs have on news reporting. Some say they draw "
                 "attention to under-reported stories. Others struggle to "
                 "establish the credibility enjoyed by professionals.")
        instance3 = {"text": text3, "label": "4"}

        for instance in [instance1, instance2, instance3]:
            sentences = splitter.split_sentences(instance['text'])
            tokenized_sentences: List[List[str]] = []
            for sentence in sentences:
                tokens = [token.text for token in spacy_tokenizer(sentence)]
                if max_sequence_length:
                    tokens = tokens[:max_sequence_length]
                tokenized_sentences.append(tokens)
            instance["tokens"] = tokenized_sentences

        assert len(instances) == 3
        fields = instances[0].fields
        text = [[token.text for token in sentence.tokens]
                for sentence in fields["tokens"]]
        assert text == instance1["tokens"]
        assert fields["label"].label == instance1["label"]
        fields = instances[1].fields
        text = [[token.text for token in sentence.tokens]
                for sentence in fields["tokens"]]
        assert text == instance2["tokens"]
        assert fields["label"].label == instance2["label"]
        fields = instances[2].fields
        text = [[token.text for token in sentence.tokens]
                for sentence in fields["tokens"]]
        assert text == instance3["tokens"]
        assert fields["label"].label == instance3["label"]
 def __init__(self,
              language: str = 'en_core_web_sm',
              pos_tags: bool = False,
              parse: bool = False,
              ner: bool = False,
              wst: bool = False) -> None:
     self.spacy = get_spacy_model(language, pos_tags, parse, ner)
     if wst:
         self.spacy.tokenizer = WhitespaceTokenizer(self.spacy.vocab)
Example #10
    def __init__(
        self, model: Model, dataset_reader: DatasetReader, language: str = "en_core_web_sm"
    ) -> None:
        super().__init__(model, dataset_reader)

        # We have to use spacy to tokenise our document here, because we need
        # to also know sentence boundaries to propose valid mentions.
        self._spacy = get_spacy_model(language, pos_tags=True, parse=True, ner=False)
        self._word_tokenizer = custom_word_tokenizer()
        self._word_tokenize = lambda text: [token for token in split_tokens_by_hyphen(self._word_tokenizer.tokenize(text))]
 def __init__(self,
              language: str = 'en_core_web_sm',
              rule_based: bool = False) -> None:
     # we need spacy's dependency parser if we're not using rule-based sentence boundary detection.
     self.spacy = get_spacy_model(language, parse=not rule_based, ner=False, pos_tags=False)
     if rule_based:
         # we use `sbd`, a built-in spacy module for rule-based sentence boundary detection.
         if not self.spacy.has_pipe('sbd'):
             sbd = self.spacy.create_pipe('sbd')
             self.spacy.add_pipe(sbd)
 def __init__(self, language: str = "en_core_web_sm", rule_based: bool = False) -> None:
     # we need spacy's dependency parser if we're not using rule-based sentence boundary detection.
     self.spacy = get_spacy_model(language, parse=not rule_based, ner=False, pos_tags=False)
     if rule_based:
         # we use `sentencizer`, a built-in spacy module for rule-based sentence boundary detection.
         # depending on the spacy version, it could be called 'sentencizer' or 'sbd'
         sbd_name = "sbd" if spacy.__version__ < "2.1" else "sentencizer"
         if not self.spacy.has_pipe(sbd_name):
             sbd = self.spacy.create_pipe(sbd_name)
             self.spacy.add_pipe(sbd)
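
The two variants above deal with spaCy 2.x renaming the rule-based splitter from `sbd` to `sentencizer`, while Example #1 deals with the later 3.x change where `add_pipe` takes a factory name instead of a component instance. A version-tolerant sketch combining both checks (assuming only the built-in sentencizer is needed):

import spacy

nlp = spacy.blank("en")
sbd_name = "sbd" if spacy.__version__ < "2.1" else "sentencizer"
if not nlp.has_pipe(sbd_name):
    try:
        # spaCy >= 3.0: add_pipe takes the factory name directly
        nlp.add_pipe(sbd_name)
    except ValueError:
        # spaCy < 3.0: add_pipe expects a component instance
        nlp.add_pipe(nlp.create_pipe(sbd_name))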
Example #13
 def __init__(self,
              language: str = 'en_core_web_sm',
              pos_tags: bool = False,
              parse: bool = False,
              ner: bool = False,
              keep_spacy_tokens: bool = False,
              never_split: List[str] = None) -> None:
     self.spacy = get_spacy_model(language, pos_tags, parse, ner)
     if never_split is not None:
         self.spacy.tokenizer = custom_tokenizer(self.spacy, never_split)
     self._keep_spacy_tokens = keep_spacy_tokens
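
`custom_tokenizer` is not shown in this excerpt. One plausible sketch, assuming the `never_split` strings only need to be registered as special cases on the default tokenizer (the approach is an assumption, not the original helper):

from spacy.symbols import ORTH


def custom_tokenizer(nlp, never_split):
    """Sketch: keep spaCy's default tokenizer but never split the given strings."""
    for string in never_split:
        nlp.tokenizer.add_special_case(string, [{ORTH: string}])
    return nlp.tokenizer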
    def __init__(self,
                 language: str = 'en_core_web_sm',
                 pos_tags: bool = False,
                 parse: bool = False,
                 ner: bool = False,
                 keep_spacy_tokens: bool = False,
                 split_on_spaces: bool = False) -> None:
        self.spacy = get_spacy_model(language, pos_tags, parse, ner)
        if split_on_spaces:
            self.spacy.tokenizer = WhitespaceTokenizer(self.spacy.vocab)

        self._keep_spacy_tokens = keep_spacy_tokens
Example #15
 def __init__(self,
              language: str = 'en_core_web_sm',
              rule_based: bool = False) -> None:
     # we need spacy's dependency parser if we're not using rule-based sentence boundary detection.
     self.spacy = get_spacy_model(language,
                                  parse=not rule_based,
                                  ner=False,
                                  pos_tags=False)
     if rule_based:
         # we use `sbd`, a built-in spacy module for rule-based sentence boundary detection.
         if not self.spacy.has_pipe('sbd'):
             sbd = self.spacy.create_pipe('sbd')
             self.spacy.add_pipe(sbd)
Example #16
 def new_fn(data: Union[spacy.tokens.doc.Doc, Dict, str]):
     if not isinstance(data, spacy.tokens.doc.Doc):
         model = get_spacy_model(language, **kwargs)
         if isinstance(data, Dict):
             for key, val in data.items():
                 if isinstance(val, str):
                     data[key] = model(val)
         elif isinstance(data, tuple):
             data = tuple(model(tup) if isinstance(tup, str) else tup for tup in data)
         elif isinstance(data, str):
             data = model(data)
         else:
             pass
     return fn(data)
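
The closure variables `fn`, `language`, and `kwargs` suggest `new_fn` is the inner function of a decorator that converts raw strings into spaCy `Doc`s before calling the wrapped function. A hypothetical enclosing wrapper (the name `with_spacy_docs` and its exact signature are assumptions):

import functools
from typing import Callable, Dict, Tuple, Union

import spacy
from allennlp.common.util import get_spacy_model


def with_spacy_docs(language: str = "en_core_web_sm", **kwargs) -> Callable:
    """Hypothetical decorator built around the `new_fn` pattern shown above."""

    def decorator(fn: Callable) -> Callable:
        @functools.wraps(fn)
        def new_fn(data: Union[spacy.tokens.doc.Doc, Dict, Tuple, str]):
            if not isinstance(data, spacy.tokens.doc.Doc):
                model = get_spacy_model(language, **kwargs)
                if isinstance(data, dict):
                    data = {key: model(val) if isinstance(val, str) else val
                            for key, val in data.items()}
                elif isinstance(data, tuple):
                    data = tuple(model(item) if isinstance(item, str) else item
                                 for item in data)
                elif isinstance(data, str):
                    data = model(data)
            return fn(data)

        return new_fn

    return decorator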
Example #17
    def test_as_array_produces_token_array(self):
        indexer = SpacyTokenIndexer()
        nlp = get_spacy_model("en_core_web_sm", parse=False, ner=False)
        tokens = [t for t in nlp("This is a sentence.")]
        field = TextField(tokens, token_indexers={"spacy": indexer})

        vocab = Vocabulary()
        field.index(vocab)

        # Indexer functionality
        array_dict = indexer.tokens_to_indices(tokens, vocab)
        assert len(array_dict["tokens"]) == 5
        assert len(array_dict["tokens"][0]) == 96

        # Check it also works with field
        lengths = field.get_padding_lengths()
        array_dict = field.as_tensor(lengths)

        assert list(array_dict["spacy"]["tokens"].shape) == [5, 96]
    def __init__(
        self,
        language: str = "en_core_web_sm",
        pos_tags: bool = False,
        parse: bool = False,
        ner: bool = False,
        keep_spacy_tokens: bool = False,
        split_on_spaces: bool = False,
        start_tokens: Optional[List[str]] = None,
        end_tokens: Optional[List[str]] = None,
    ) -> None:
        self.spacy = get_spacy_model(language, pos_tags, parse, ner)
        if split_on_spaces:
            self.spacy.tokenizer = _WhitespaceSpacyTokenizer(self.spacy.vocab)

        self._keep_spacy_tokens = keep_spacy_tokens
        self._start_tokens = start_tokens or []
        # We reverse the tokens here because we're going to insert them with `insert(0)` later;
        # this makes sure they show up in the right order.
        self._start_tokens.reverse()
        self._end_tokens = end_tokens or []
Example #19
def load_lsmdc(split):
    """
    Loads LSMDC with <someone> annotations
    #TODO: investigate filtering things.
    # 1: all sentences need verbs
    # 2: filter out all sentences that don't begin with a capital letter (these are often incomplete)
    # 3. all sentences need objects

    :return:
    """
    lsmdc = pd.read_csv(os.path.join(
        DATA_PATH, 'movies', 'LSMDC16_annos_{}.csv'.format({
            'train': 'training',
            'val': 'val',
            'test': 'test'
        }[split])),
                        sep='\t',
                        header=None,
                        names=[
                            'movie', 'start_aligned', 'end_aligned',
                            'start_extracted', 'end_extracted', 'sentence'
                        ])
    lsmdc['movie'] = lsmdc['movie'].apply(
        lambda x: '_'.join(x.split('_')[:-1]))
    del lsmdc['start_extracted']
    del lsmdc['end_extracted']

    lsmdc['start_aligned'] = _to_time(lsmdc['start_aligned'])
    lsmdc['end_aligned'] = _to_time(lsmdc['end_aligned'])

    def _fix_sent(sent):
        sent1 = remove_allcaps(sent)
        # This is really minor but if it ends with " ." then change that.
        sent2 = sent1[:-1].rstrip() + '.' if sent1.endswith(' .') else sent1
        return unidecode(sent2)

    from nltk.tokenize.moses import MosesDetokenizer
    detokenizer = MosesDetokenizer()
    spacy = get_spacy_model("en_core_web_sm",
                            pos_tags=True,
                            parse=True,
                            ner=False)

    def check_if_sent_is_grammatical(sent):
        if sent[0].islower():
            print("{} is not grammatical (lowercase start)".format(sent))
            return ''
        # Sanitize the sentence
        sent_sanitized = remove_allcaps(sent)

        # Loop over subsentences to find a good one
        for sent_parsed in spacy(sent_sanitized).sents:
            root = sent_parsed.root
            if root.pos_ != 'VERB':
                print("{} is not grammatical (noverb)".format(sent))
                pass
            elif sent_parsed[0].orth_ in ('and', 'where', 'when'):
                print("{} is not grammatical (and)".format(sent))
                pass
            elif sent_parsed[-2].orth_ in ('and', 'then'):
                print("{} is not grammatical (and then)".format(sent))
                pass
            elif not any(
                [x.dep_ in ('nsubj', 'nsubjpass') for x in sent_parsed]):
                print("{} is not grammatical (no subj)".format(sent))
                pass
            else:
                print('good! {}'.format(sent))
                return unidecode(
                    detokenizer.detokenize([x.orth_ for x in sent_parsed],
                                           return_str=True))
        return ''

    lsmdc['sentence'] = lsmdc['sentence'].apply(check_if_sent_is_grammatical)
    lsmdc = lsmdc[lsmdc['sentence'].str.len() > 0]
    return _lsmdc_to_list(lsmdc)
Example #20
    def __init__(self, model: Model, dataset_reader: DatasetReader, language: str = 'en_core_web_sm') -> None:
        super().__init__(model, dataset_reader)

        # We have to use spacy to tokenise our document here, because we need
        # to also know sentence boundaries to propose valid mentions.
        self._spacy = get_spacy_model(language, pos_tags=True, parse=True, ner=False)
Example #21
import json
import spacy
from allennlp.predictors.predictor import Predictor
from allennlp.models.archival import load_archive
from allennlp.common.util import get_spacy_model

spacy_ = get_spacy_model('en_core_web_sm',
                         pos_tags=True,
                         parse=True,
                         ner=False)


# pass which = 0 for both parses, 1 for just depend, 2 for just const
def parse(data, which=0):
    if which != 2:
        # depend parse
        darchive = load_archive(
            "https://s3-us-west-2.amazonaws.com/allennlp/models/biaffine-dependency-parser-ptb-2018.08.23.tar.gz"
        )
        dpred = Predictor.from_archive(darchive, 'biaffine-dependency-parser')

    if which != 1:
        # const parse
        carchive = load_archive(
            "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz"
        )
        cpred = Predictor.from_archive(carchive, 'constituency-parser')

    for d in data:
        if which != 2:
            dep = dpred.predict_json({"sentence": d.sentence})
Example #22
def load_lm_data(fold=None, mode='train'):
    """
    Turns the sequential data into instances.
    :param split:
    :return:
    """
    # Get or make vocab
    spacy_model = get_spacy_model("en_core_web_sm",
                                  pos_tags=False,
                                  parse=False,
                                  ner=False)
    if os.path.exists('vocabulary'):
        print(
            "Loading cached vocab. caution if you're building the dataset again!!!!",
            flush=True)
        vocab = Vocabulary.from_files('vocabulary')

        with open(os.path.join(DATA_PATH, 'events-3.json'), 'r') as f:
            lm_data = json.load(f)
        lm_data = [
            data_item for s in ('train', 'val', 'test')
            for data_item in lm_data[s]
        ]
    else:
        assert fold is None
        with open(os.path.join(DATA_PATH, 'events-3.json'), 'r') as f:
            lm_data = json.load(f)
        lm_data = [
            data_item for s in ('train', 'val', 'test')
            for data_item in lm_data[s]
        ]
        # Manually doing this because I don't want to double count things
        vocab = Vocabulary.from_instances([
            Instance({
                'story':
                TextField(
                    [
                        Token(x) for x in ['@@bos@@'] +
                        [x.orth_ for x in spacy_model(sent)] + ['@@eos@@']
                    ],
                    token_indexers={
                        'tokens':
                        SingleIdTokenIndexer(namespace='tokens',
                                             lowercase_tokens=True)
                    })
            }) for data_item in lm_data for sent in data_item['sentences']
        ],
                                          min_count={'tokens': 3})

        vocab.get_index_to_token_vocabulary('tokens')
        vocab.save_to_files('vocabulary')
        print("VOCABULARY HAS {} ITEMS".format(
            vocab.get_vocab_size(namespace='tokens')))

    if all([
            os.path.exists('lm-{}-of-{}.pkl'.format(i, NUM_FOLDS))
            for i in range(NUM_FOLDS)
    ]):
        print("LOADING CACHED DATASET", flush=True)
        if mode == 'val':
            with open('lm-{}-of-{}.pkl'.format(fold, NUM_FOLDS), 'rb') as f:
                print("Loading split{} for {}".format(fold, mode))
                instances = pkl.load(f)
        else:
            instances = []
            for other_fold in range(NUM_FOLDS):
                if other_fold != fold:
                    with open('lm-{}-of-{}.pkl'.format(other_fold, NUM_FOLDS),
                              'rb') as f:
                        print("Loading split{} for {}".format(
                            other_fold, mode))
                        instances += pkl.load(f)
        return instances, vocab

    print("MAKING THE DATASET", flush=True)
    assert fold is None
    for item in tqdm(lm_data):
        item['sentences_tokenized'] = [[st.orth_ for st in spacy_model(sent)]
                                       for sent in item['sentences']]

    def _to_instances(data):
        # flatten this
        instances = []
        for item in data:
            for s1, s2 in pairwise(item['sentences_tokenized']):
                instances.append((
                    Instance({
                        'story':
                        TextField(
                            [
                                Token(x)
                                for x in ['@@bos@@'] + s1 + s2 + ['@@eos@@']
                            ],
                            token_indexers={
                                'tokens':
                                SingleIdTokenIndexer(namespace='tokens',
                                                     lowercase_tokens=True)
                            })
                    }),
                    s1,
                    s2,
                    item,
                ))
        return instances

    random.seed(123456)
    random.shuffle(lm_data)
    all_sets = []
    for fold_ in range(NUM_FOLDS):
        val_set = _to_instances(
            lm_data[len(lm_data) * fold_ // NUM_FOLDS:len(lm_data) *
                    (fold_ + 1) // NUM_FOLDS])
        with open('lm-{}-of-{}.pkl'.format(fold_, NUM_FOLDS), 'wb') as f:
            pkl.dump(val_set, f)
        all_sets.extend(val_set)
    return all_sets, vocab
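
`pairwise` (used in `_to_instances` above) is not defined in this excerpt; it is presumably the standard itertools recipe that yields consecutive overlapping pairs of sentences:

import itertools


def pairwise(iterable):
    """s -> (s0, s1), (s1, s2), (s2, s3), ..."""
    a, b = itertools.tee(iterable)
    next(b, None)
    return zip(a, b)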
Example #23
 def __init__(self):
     self.spacy = get_spacy_model('en_core_web_sm', False, False, False)
 def __init__(self, language: str = "en_core_web_sm") -> None:
     self.spacy = get_spacy_model(language, False, False, False)
Example #25
import pandas as pd
from tqdm import tqdm
from allennlp.common.util import get_spacy_model

USE_ONLY_GOLD_EXAMPLES = False

spacy_model = get_spacy_model("en_core_web_sm",
                              pos_tags=False,
                              parse=False,
                              ner=False)


def _tokenize(sent):
    return ' '.join([x.orth_.lower() for x in spacy_model(sent)])


for split in ('train', 'val', 'test'):
    df = pd.read_csv('../../data/{}.csv'.format(split))
    df['distractor-3'].fillna('', inplace=True)
    if USE_ONLY_GOLD_EXAMPLES and split == 'train':
        oldsize = df.shape[0]
        df = df[df['gold-source'].str.startswith('gold')]
        print("Going from {} -> {} items in train".format(
            oldsize, df.shape[0]))

    with open(
            '{}-{}.txt'.format(
                split, 'goldonly' if USE_ONLY_GOLD_EXAMPLES else 'genalso'),
            'w') as f:
        for _, item in tqdm(df.iterrows()):
            # num_distractors = 4 if (len(item['distractor-3']) != 0 and split == 'train') else 3
Example #26
 def __init__(self,
              language: str = 'en_core_web_lg',
              pos_tags: bool = False,
              parse: bool = False,
              ner: bool = False) -> None:
     self.spacy = get_spacy_model(language, pos_tags, parse, ner)
 def __init__(self,
              language: str = 'en_core_web_sm',
              pos_tags: bool = False,
              parse: bool = False,
              ner: bool = False) -> None:
     self.spacy = get_spacy_model(language, pos_tags, parse, ner)
Example #28
def load_visual_madlibs(split):
    """
    Loads the Visual Madlibs dataset, including captions from COCO as the premises
    :return:
    """
    # Let's make sure each contains a verb.
    spacy = get_spacy_model("en_core_web_sm",
                            pos_tags=True,
                            parse=True,
                            ner=False)
    from nltk.tokenize.moses import MosesDetokenizer
    from pattern.en import conjugate, PRESENT  # used below to shift futures to present tense
    detokenizer = MosesDetokenizer()

    def _sentence_contains_verb(sent):
        spacy_parse = spacy(sent)
        for tok in spacy_parse:
            if tok.pos_ == 'VERB' and tok.lemma_ not in ('is', 'has'):
                return True
        return False

    def order_sents(sent_list):
        sentence_has_verb = np.array(
            [_sentence_contains_verb(s) for i, s in enumerate(sent_list)])
        sentence_length = np.array([len(x) for x in sent_list])

        sentence_score = sentence_has_verb.astype(
            np.float32) + sentence_length / sentence_length.max()
        best_to_worst = np.argsort(-sentence_score).tolist()
        return [sent_list[i] for i in best_to_worst]

    futures_fn = {
        'train': 'tr_futures.json',
        'val': 'val_easy_multichoice_futures.json',
        'test': 'val_hard_multichoice_futures.json',
    }[split]
    key = {
        'train': 'tr_futures',
        'val': "multichoice_futures",
        'test': "multichoice_futures",
    }[split]

    id2futureandpast = defaultdict(lambda: {
        'captions': [],
        'future': [],
        'past': []
    })
    # with open(os.path.join(DATA_PATH, 'visualmadlibs', 'tr_pasts.json'), 'r') as f:
    #     for item in json.load(f)['tr_pasts']:
    #         id2futureandpast[item['image_id']]['past'] = order_sents(item['fitbs'])
    with open(os.path.join(DATA_PATH, 'visualmadlibs', futures_fn), 'r') as f:
        for item in json.load(f)[key]:
            if split == 'train':
                id2futureandpast[item['image_id']]['future'] = order_sents(
                    item['fitbs'])
            else:
                id2futureandpast[item['image_id']]['future'] = [item['pos']]

    with open(os.path.join(DATA_PATH, 'coco', 'dataset_coco.json'), 'r') as f:
        imgid2caps = {
            item['cocoid']:
            ([sent['raw'] for sent in item['sentences']], item['split'])
            for item in json.load(f)['images']
        }

    vml = []
    for k in tqdm(id2futureandpast):
        for cap, future in zip(order_sents(imgid2caps[k][0]),
                               id2futureandpast[k]['future']):
            # Spacy parse the future sentence, change to present tense
            spacy_parse = [(x.orth_, x.pos_, x.dep_) for x in spacy(future)]

            # If there's a ROOT that doesn't start with ing, parse that
            is_match = False
            for i, (word, pos, dep) in enumerate(spacy_parse):
                if pos == 'VERB' and dep == 'ROOT' and not word.endswith(
                        'ing'):
                    spacy_parse[i] = (conjugate(word, tense=PRESENT), pos, dep)
                    is_match = True

            # Else convert AUXes
            if not is_match:
                for i, (word, pos, dep) in enumerate(spacy_parse):
                    if pos == 'VERB' and dep == 'aux':
                        spacy_parse[i] = (conjugate(word,
                                                    tense=PRESENT), pos, dep)

            future_fixed = detokenizer.detokenize([x[0] for x in spacy_parse],
                                                  return_str=True)
            print("{} -> {}".format(future, future_fixed), flush=True)

            future_fixed = future_fixed[0].capitalize() + future_fixed[1:]
            vml.append({'id': k, 'sentences': [cap, future_fixed]})

    # ABANDON THIS FOR NOW.

    #
    #     id2futureandpast[k]['captions'] = (order_sents(imgid2caps[k][0]), imgid2caps[k][1])
    #
    #
    # # Join everything
    # vml = []
    # for id, val in id2futureandpast.items():
    #     vml.append({'id': id, 'sentences': ['{} Afterwards, {}.'.format(cap, future)
    #                                         for cap, future in zip(
    #             val['captions'][0], val['future'])]})
    # import ipdb
    # ipdb.set_trace()
    return vml