def __init__(self, config: "TokenizerConfiguration"):
    self._config = config
    self._end_tokens = config.end_tokens or []
    self._start_tokens = config.start_tokens or []
    # We reverse the tokens here because we're going to insert them with `insert(0)` later;
    # this makes sure they show up in the right order.
    self._start_tokens.reverse()
    self.__nlp__ = get_spacy_model(self._config.spacy_model, pos_tags=True, ner=False, parse=False)
    if config.segment_sentences and not self.__nlp__.has_pipe(self.__SPACY_SENTENCIZER__):
        try:
            self.__nlp__.add_pipe(self.__SPACY_SENTENCIZER__)  # spacy >= 3.0.0 takes the name
        except ValueError:
            # spacy < 3.0.0 needs the pipe object itself
            self.__nlp__.add_pipe(self.__nlp__.create_pipe(self.__SPACY_SENTENCIZER__))
    if config.text_cleaning is None:
        self.text_cleaning = TextCleaning()
    else:
        self.text_cleaning = TextCleaning.from_params(Params(copy.deepcopy(config.text_cleaning)))
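# A minimal, self-contained sketch of the add_pipe fallback used above, assuming only
# spacy is installed; the blank pipeline is arbitrary and stands in for the loaded model.
# In spaCy >= 3.0, `add_pipe` takes the component name; in spaCy < 3.0 it needs the
# object returned by `create_pipe`, so the string form raises ValueError there.
import spacy

nlp = spacy.blank("en")
if not nlp.has_pipe("sentencizer"):
    try:
        nlp.add_pipe("sentencizer")  # spaCy >= 3.0
    except ValueError:
        nlp.add_pipe(nlp.create_pipe("sentencizer"))  # spaCy < 3.0
doc = nlp("First sentence. Second sentence.")
print([sent.text for sent in doc.sents])  # ['First sentence.', 'Second sentence.']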
def __init__(self,
             model: Model,
             dataset_reader: DatasetReader,
             language: str = 'en_core_web_sm') -> None:
    super().__init__(model, dataset_reader)
    self._spacy = get_spacy_model(language, pos_tags=True, parse=True, ner=False)
    self._model._full_output_score = True
    self._beam_size = 1
    self._best_n = 5
    self._spearmanr_wel = Welford()
    self._kendalls_tau_wel = Welford()
    self._pearsonr_wel = Welford()
    self._pmr_correct = 0.0
    self._pmr_total = 0.0
    self._pos_acc_correct = 0.0
    self._pos_acc_total = 0.0
    self._spearmanr_p_values = []
    self._kendalls_tau_p_values = []
    self._pearsonr_p_values = []
    self._exclude_first = True
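# The `Welford` objects above are running-statistics accumulators. This is a hedged
# sketch of Welford's online mean/variance algorithm, an assumption about what the
# imported Welford provides rather than its actual implementation:
class Welford:
    def __init__(self):
        self.n = 0
        self.mean = 0.0
        self.m2 = 0.0  # sum of squared deviations from the running mean

    def add(self, x):
        # Update count, mean, and m2 in one pass without storing all values.
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (x - self.mean)

    @property
    def variance(self):
        # Sample variance; undefined for fewer than two observations.
        return self.m2 / (self.n - 1) if self.n > 1 else 0.0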
def test_read_from_file_reuters_corpus_and_segments_sentences_properly(
        self, lazy, max_sequence_length):
    reader = MultiLabelTextClassificationJsonReader(
        lazy=lazy, segment_sentences=True, max_sequence_length=max_sequence_length)
    reuters_path = Path("tests/fixtures") / "data" / "reuters-21578" / "train.jsonl"
    instances = reader.read(reuters_path)
    instances = ensure_list(instances)

    splitter = SpacySentenceSplitter()
    spacy_tokenizer = get_spacy_model("en_core_web_sm", False, False, False)

    text1 = (
        "U.K. GROWING IMPATIENT WITH JAPAN - THATCHER Prime Minister Margaret Thatcher said the"
        " U.K. Was growing more impatient with Japanese trade barriers and warned that it would"
        " soon have new powers against countries not offering reciprocal access to their"
        " markets.")
    instance1 = {"text": text1, "labels": ["acq", "trade"]}
    text2 = (
        "CANADA OIL EXPORTS RISE 20 PCT IN 1986 Canadian oil exports rose 20 pct in 1986 over"
        " the previous year to 33.96 mln cubic meters, while oil imports soared 25.2 pct to"
        " 20.58 mln cubic meters, Statistics Canada said. Production, meanwhile, was unchanged"
        " from the previous year at 91.09 mln cubic feet.")
    instance2 = {"text": text2, "labels": ["nat-gas", "crude"]}
    text3 = (
        "COFFEE, SUGAR AND COCOA EXCHANGE NAMES CHAIRMAN The New York Coffee, Sugar and Cocoa"
        " Exchange (CSCE) elected former first vice chairman Gerald Clancy to a two-year term"
        " as chairman of the board of managers, replacing previous chairman Howard Katz. Katz,"
        " chairman since 1985, will remain a board member.")
    instance3 = {"text": text3, "labels": ["sugar", "cocoa", "coffee"]}

    for instance in [instance1, instance2, instance3]:
        sentences = splitter.split_sentences(instance["text"])
        tokenized_sentences: List[List[str]] = []
        for sentence in sentences:
            tokens = [token.text for token in spacy_tokenizer(sentence)]
            if max_sequence_length:
                tokens = tokens[:max_sequence_length]
            tokenized_sentences.append(tokens)
        instance["tokens"] = tokenized_sentences

    assert len(instances) == 3
    fields = instances[0].fields
    text = [[token.text for token in sentence.tokens] for sentence in fields["tokens"]]
    assert text == instance1["tokens"]
    assert fields["labels"].labels == instance1["labels"]
    fields = instances[1].fields
    text = [[token.text for token in sentence.tokens] for sentence in fields["tokens"]]
    assert text == instance2["tokens"]
    assert fields["labels"].labels == instance2["labels"]
    fields = instances[2].fields
    text = [[token.text for token in sentence.tokens] for sentence in fields["tokens"]]
    assert text == instance3["tokens"]
    assert fields["labels"].labels == instance3["labels"]
def __init__(
    self,
    language: str = "en_core_web_sm",
    pos_tags: bool = True,
    parse: bool = False,
    ner: bool = False,
    keep_spacy_tokens: bool = False,
    split_on_spaces: bool = False,
    start_tokens: Optional[List[str]] = None,
    end_tokens: Optional[List[str]] = None,
) -> None:
    # Save these for use later in the _to_params method
    self._language = language
    self._pos_tags = pos_tags
    self._parse = parse
    self._ner = ner
    self._split_on_spaces = split_on_spaces

    self.spacy = get_spacy_model(self._language, self._pos_tags, self._parse, self._ner)
    if self._split_on_spaces:
        self.spacy.tokenizer = _WhitespaceSpacyTokenizer(self.spacy.vocab)

    self._keep_spacy_tokens = keep_spacy_tokens
    self._start_tokens = start_tokens or []
    # We reverse the tokens here because we're going to insert them with `insert(0)` later;
    # this makes sure they show up in the right order.
    self._start_tokens.reverse()
    self._is_version_3 = spacy.__version__ >= "3.0"
    self._end_tokens = end_tokens or []
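# Why the start tokens are reversed: a small sketch, independent of the class above.
# Inserting each token at position 0 reverses their relative order, so reversing the
# list first makes them come out as written.
start_tokens = ["<s1>", "<s2>"]
start_tokens.reverse()  # -> ['<s2>', '<s1>']
tokens = ["hello", "world"]
for token in start_tokens:
    tokens.insert(0, token)
print(tokens)  # ['<s1>', '<s2>', 'hello', 'world']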
def __init__(self,
             language: str = 'en_core_web_sm',
             pos_tags: bool = False,
             parse: bool = False,
             ner: bool = False) -> None:
    self.spacy = get_spacy_model(language, pos_tags, parse, ner)
    # Modify the default to use the whitespace tokenizer defined above.
    self.spacy.tokenizer = WhitespaceTokenizer(self.spacy.vocab)
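# The `WhitespaceTokenizer` referenced here is defined elsewhere in its source file and
# not shown in these snippets. This is a hedged sketch of what such a spaCy-compatible
# whitespace tokenizer typically looks like, following spaCy's custom-tokenizer docs
# (the details are assumptions): it splits on single spaces only and hands spaCy a
# ready-made Doc.
from spacy.tokens import Doc


class WhitespaceTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(" ")
        # spaces[i] is True if a space follows words[i]; with a plain split on
        # single spaces, every token except the last is followed by one.
        spaces = [True] * len(words)
        spaces[-1] = False
        return Doc(self.vocab, words=words, spaces=spaces)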
def __init__(self,
             language: str = 'en_core_web_sm',
             pos_tags: bool = False,
             parse: bool = False,
             ner: bool = False,
             keep_spacy_tokens: bool = False) -> None:
    self.spacy = get_spacy_model(language, pos_tags, parse, ner)
    self._keep_spacy_tokens = keep_spacy_tokens
def __init__(
    self, model: Model, dataset_reader: DatasetReader, language: str = "en_core_web_sm"
) -> None:
    super().__init__(model, dataset_reader)
    # We have to use spacy to tokenise our document here, because we need
    # to also know sentence boundaries to propose valid mentions.
    self._spacy = get_spacy_model(language, pos_tags=True, parse=True, ner=False)
def test_read_from_file_ag_news_corpus_and_segments_sentences_properly(
        self, lazy: bool, label_name: str, max_sequence_length: Optional[int]):
    reader = TextSentimentReader(lazy=lazy,
                                 segment_sentences=True,
                                 label_name=label_name,
                                 max_sequence_length=max_sequence_length)
    ag_path = Path(DATA_DIR, 'ag_news_corpus_original.jsonl')
    if label_name == 'text_sentiment':
        ag_path = Path(DATA_DIR, 'ag_news_corpus.jsonl')
    instances = reader.read(ag_path)
    instances = ensure_list(instances)

    splitter = SpacySentenceSplitter()
    spacy_tokenizer = get_spacy_model("en_core_web_sm", False, False, False)

    text1 = ("Memphis Rout Still Stings for No. 14 Louisville; Coach "
             "Petrino Vows to Have Team Better Prepared. NASHVILLE, "
             "Tenn. Nov 3, 2004 - Louisville #39;s 30-point loss "
             "at home to Memphis last season is still a painful memory "
             "for the Cardinals.")
    instance1 = {"text": text1, "label": "2"}
    text2 = ("AP - Eli Manning has replaced Kurt Warner as the New York"
             " Giants' starting quarterback.")
    instance2 = {"text": text2, "label": "2"}
    text3 = ("A conference dedicated to online journalism explores the "
             "effect blogs have on news reporting. Some say they draw "
             "attention to under-reported stories. Others struggle to "
             "establish the credibility enjoyed by professionals.")
    instance3 = {"text": text3, "label": "4"}

    for instance in [instance1, instance2, instance3]:
        sentences = splitter.split_sentences(instance['text'])
        tokenized_sentences: List[List[str]] = []
        for sentence in sentences:
            tokens = [token.text for token in spacy_tokenizer(sentence)]
            if max_sequence_length:
                tokens = tokens[:max_sequence_length]
            tokenized_sentences.append(tokens)
        instance["tokens"] = tokenized_sentences

    assert len(instances) == 3
    fields = instances[0].fields
    text = [[token.text for token in sentence.tokens] for sentence in fields["tokens"]]
    assert text == instance1["tokens"]
    assert fields["label"].label == instance1["label"]
    fields = instances[1].fields
    text = [[token.text for token in sentence.tokens] for sentence in fields["tokens"]]
    assert text == instance2["tokens"]
    assert fields["label"].label == instance2["label"]
    fields = instances[2].fields
    text = [[token.text for token in sentence.tokens] for sentence in fields["tokens"]]
    assert text == instance3["tokens"]
    assert fields["label"].label == instance3["label"]
def __init__(self,
             language: str = 'en_core_web_sm',
             pos_tags: bool = False,
             parse: bool = False,
             ner: bool = False,
             wst: bool = False) -> None:
    self.spacy = get_spacy_model(language, pos_tags, parse, ner)
    if wst:
        # `wst` swaps spaCy's default tokenizer for the whitespace tokenizer.
        self.spacy.tokenizer = WhitespaceTokenizer(self.spacy.vocab)
def __init__(
    self, model: Model, dataset_reader: DatasetReader, language: str = "en_core_web_sm"
) -> None:
    super().__init__(model, dataset_reader)
    # We have to use spacy to tokenise our document here, because we need
    # to also know sentence boundaries to propose valid mentions.
    self._spacy = get_spacy_model(language, pos_tags=True, parse=True, ner=False)
    self._word_tokenizer = custom_word_tokenizer()
    self._word_tokenize = lambda text: list(
        split_tokens_by_hyphen(self._word_tokenizer.tokenize(text)))
def __init__(self, language: str = 'en_core_web_sm', rule_based: bool = False) -> None:
    # We need spacy's dependency parser if we're not using rule-based sentence
    # boundary detection.
    self.spacy = get_spacy_model(language, parse=not rule_based, ner=False, pos_tags=False)
    if rule_based:
        # We use `sbd`, a built-in spacy module for rule-based sentence boundary detection.
        if not self.spacy.has_pipe('sbd'):
            sbd = self.spacy.create_pipe('sbd')
            self.spacy.add_pipe(sbd)
def __init__(self, language: str = "en_core_web_sm", rule_based: bool = False) -> None:
    # We need spacy's dependency parser if we're not using rule-based sentence
    # boundary detection.
    self.spacy = get_spacy_model(language, parse=not rule_based, ner=False, pos_tags=False)
    if rule_based:
        # We use `sentencizer`, a built-in spacy module for rule-based sentence boundary
        # detection. Depending on the spacy version, it is called 'sentencizer' or 'sbd'.
        sbd_name = "sbd" if spacy.__version__ < "2.1" else "sentencizer"
        if not self.spacy.has_pipe(sbd_name):
            sbd = self.spacy.create_pipe(sbd_name)
            self.spacy.add_pipe(sbd)
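# Hedged usage sketch for a splitter built as above, assuming it is AllenNLP's
# SpacySentenceSplitter (which matches this constructor and the `split_sentences`
# calls in the tests elsewhere in this section):
from allennlp.data.tokenizers.sentence_splitter import SpacySentenceSplitter

splitter = SpacySentenceSplitter(rule_based=True)
print(splitter.split_sentences("Dr. Smith arrived. He sat down."))
# With rule_based=True the sentencizer splits on punctuation only, which is much
# cheaper than the dependency parser but can mis-split around abbreviations like "Dr.".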
def __init__(self,
             language: str = 'en_core_web_sm',
             pos_tags: bool = False,
             parse: bool = False,
             ner: bool = False,
             keep_spacy_tokens: bool = False,
             never_split: Optional[List[str]] = None) -> None:
    self.spacy = get_spacy_model(language, pos_tags, parse, ner)
    if never_split is not None:
        self.spacy.tokenizer = custom_tokenizer(self.spacy, never_split)
    self._keep_spacy_tokens = keep_spacy_tokens
def __init__(self,
             language: str = 'en_core_web_sm',
             pos_tags: bool = False,
             parse: bool = False,
             ner: bool = False,
             keep_spacy_tokens: bool = False,
             split_on_spaces: bool = False) -> None:
    self.spacy = get_spacy_model(language, pos_tags, parse, ner)
    if split_on_spaces:
        self.spacy.tokenizer = WhitespaceTokenizer(self.spacy.vocab)
    self._keep_spacy_tokens = keep_spacy_tokens
def new_fn(data: Union[spacy.tokens.doc.Doc, Dict, Tuple, str]):
    if not isinstance(data, spacy.tokens.doc.Doc):
        model = get_spacy_model(language, **kwargs)
        if isinstance(data, dict):
            # Parse every string value in place, leaving other values untouched.
            for key, val in data.items():
                if isinstance(val, str):
                    data[key] = model(val)
        elif isinstance(data, tuple):
            data = tuple(model(tup) if isinstance(tup, str) else tup for tup in data)
        elif isinstance(data, str):
            data = model(data)
    return fn(data)
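# `new_fn` closes over `fn`, `language`, and `kwargs`, so it is likely the inner
# function of a decorator factory. A minimal runnable sketch of that pattern using
# plain spaCy (the decorator name `ensure_spacy_doc` is hypothetical, and the model
# must be installed):
import functools

import spacy


def ensure_spacy_doc(language="en_core_web_sm"):
    nlp = spacy.load(language)

    def decorator(fn):
        @functools.wraps(fn)
        def new_fn(data):
            if isinstance(data, str):
                data = nlp(data)  # parse raw strings into Docs before calling fn
            return fn(data)

        return new_fn

    return decorator


@ensure_spacy_doc()
def count_tokens(doc):
    return len(doc)


print(count_tokens("This is a sentence."))  # the decorator parses the string first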
def test_as_array_produces_token_array(self):
    indexer = SpacyTokenIndexer()
    nlp = get_spacy_model("en_core_web_sm", parse=False, ner=False)
    tokens = [t for t in nlp("This is a sentence.")]
    field = TextField(tokens, token_indexers={"spacy": indexer})
    vocab = Vocabulary()
    field.index(vocab)

    # Indexer functionality: five tokens, each mapped to its spacy vector
    # (96 dimensions for the small English model).
    array_dict = indexer.tokens_to_indices(tokens, vocab)
    assert len(array_dict["tokens"]) == 5
    assert len(array_dict["tokens"][0]) == 96

    # Check it also works with field
    lengths = field.get_padding_lengths()
    array_dict = field.as_tensor(lengths)
    assert list(array_dict["spacy"]["tokens"].shape) == [5, 96]
def __init__(
    self,
    language: str = "en_core_web_sm",
    pos_tags: bool = False,
    parse: bool = False,
    ner: bool = False,
    keep_spacy_tokens: bool = False,
    split_on_spaces: bool = False,
    start_tokens: Optional[List[str]] = None,
    end_tokens: Optional[List[str]] = None,
) -> None:
    self.spacy = get_spacy_model(language, pos_tags, parse, ner)
    if split_on_spaces:
        self.spacy.tokenizer = _WhitespaceSpacyTokenizer(self.spacy.vocab)

    self._keep_spacy_tokens = keep_spacy_tokens
    self._start_tokens = start_tokens or []
    # We reverse the tokens here because we're going to insert them with `insert(0)` later;
    # this makes sure they show up in the right order.
    self._start_tokens.reverse()
    self._end_tokens = end_tokens or []
def load_lsmdc(split):
    """
    Loads LSMDC with <someone> annotations.

    TODO: investigate filtering things:
      1. all sentences need verbs
      2. filter out all sentences that don't begin with a capital letter (these are often incomplete)
      3. all sentences need objects
    :return:
    """
    lsmdc = pd.read_csv(os.path.join(
        DATA_PATH, 'movies', 'LSMDC16_annos_{}.csv'.format({
            'train': 'training',
            'val': 'val',
            'test': 'test'
        }[split])),
        sep='\t',
        header=None,
        names=['movie', 'start_aligned', 'end_aligned',
               'start_extracted', 'end_extracted', 'sentence'])
    lsmdc['movie'] = lsmdc['movie'].apply(lambda x: '_'.join(x.split('_')[:-1]))
    del lsmdc['start_extracted']
    del lsmdc['end_extracted']
    lsmdc['start_aligned'] = _to_time(lsmdc['start_aligned'])
    lsmdc['end_aligned'] = _to_time(lsmdc['end_aligned'])

    def _fix_sent(sent):
        sent1 = remove_allcaps(sent)
        # This is really minor, but if it ends with " ." then change that.
        sent2 = sent1[:-1].rstrip() + '.' if sent1.endswith(' .') else sent1
        return unidecode(sent2)

    from nltk.tokenize.moses import MosesDetokenizer
    detokenizer = MosesDetokenizer()
    spacy = get_spacy_model("en_core_web_sm", pos_tags=True, parse=True, ner=False)

    def check_if_sent_is_grammatical(sent):
        if sent[0].islower():
            print("{} is not grammatical (lowercase start)".format(sent))
            return ''
        # Sanitize the sentence
        sent_sanitized = remove_allcaps(sent)
        # Loop over subsentences and return the first good one
        for sent_parsed in spacy(sent_sanitized).sents:
            root = sent_parsed.root
            if root.pos_ != 'VERB':
                print("{} is not grammatical (noverb)".format(sent))
            elif sent_parsed[0].orth_ in ('and', 'where', 'when'):
                print("{} is not grammatical (and)".format(sent))
            elif sent_parsed[-2].orth_ in ('and', 'then'):
                print("{} is not grammatical (and then)".format(sent))
            elif not any(x.dep_ in ('nsubj', 'nsubjpass') for x in sent_parsed):
                print("{} is not grammatical (no subj)".format(sent))
            else:
                print('good! {}'.format(sent))
                return unidecode(detokenizer.detokenize(
                    [x.orth_ for x in sent_parsed], return_str=True))
        return ''

    lsmdc['sentence'] = lsmdc['sentence'].apply(check_if_sent_is_grammatical)
    lsmdc = lsmdc[lsmdc['sentence'].str.len() > 0]
    return _lsmdc_to_list(lsmdc)
def __init__(self,
             model: Model,
             dataset_reader: DatasetReader,
             language: str = 'en_core_web_sm') -> None:
    super().__init__(model, dataset_reader)
    # We have to use spacy to tokenise our document here, because we need
    # to also know sentence boundaries to propose valid mentions.
    self._spacy = get_spacy_model(language, pos_tags=True, parse=True, ner=False)
import json

import spacy
from allennlp.predictors.predictor import Predictor
from allennlp.models.archival import load_archive
from allennlp.common.util import get_spacy_model

spacy_ = get_spacy_model('en_core_web_sm', pos_tags=True, parse=True, ner=False)


# Pass which=0 for both parses, 1 for just the dependency parse, 2 for just the
# constituency parse.
def parse(data, which=0):
    if which != 2:
        # dependency parse
        darchive = load_archive(
            "https://s3-us-west-2.amazonaws.com/allennlp/models/biaffine-dependency-parser-ptb-2018.08.23.tar.gz")
        dpred = Predictor.from_archive(darchive, 'biaffine-dependency-parser')
    if which != 1:
        # constituency parse
        carchive = load_archive(
            "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz")
        cpred = Predictor.from_archive(carchive, 'constituency-parser')
    for d in data:
        if which != 2:
            dep = dpred.predict_json({"sentence": d.sentence})
def load_lm_data(fold=None, mode='train'):
    """
    Turns the sequential data into instances.
    :param fold: cross-validation fold to load
    :param mode: 'train' or 'val'
    :return: instances and the vocabulary
    """
    # Get or make vocab
    spacy_model = get_spacy_model("en_core_web_sm", pos_tags=False, parse=False, ner=False)
    if os.path.exists('vocabulary'):
        print("Loading cached vocab. caution if you're building the dataset again!!!!",
              flush=True)
        vocab = Vocabulary.from_files('vocabulary')
        with open(os.path.join(DATA_PATH, 'events-3.json'), 'r') as f:
            lm_data = json.load(f)
        lm_data = [data_item for s in ('train', 'val', 'test') for data_item in lm_data[s]]
    else:
        assert fold is None
        with open(os.path.join(DATA_PATH, 'events-3.json'), 'r') as f:
            lm_data = json.load(f)
        lm_data = [data_item for s in ('train', 'val', 'test') for data_item in lm_data[s]]
        # Manually doing this because I don't want to double count things
        vocab = Vocabulary.from_instances(
            [Instance({'story': TextField(
                [Token(x) for x in
                 ['@@bos@@'] + [x.orth_ for x in spacy_model(sent)] + ['@@eos@@']],
                token_indexers={'tokens': SingleIdTokenIndexer(namespace='tokens',
                                                               lowercase_tokens=True)})})
             for data_item in lm_data
             for sent in data_item['sentences']],
            min_count={'tokens': 3})
        vocab.get_index_to_token_vocabulary('tokens')
        vocab.save_to_files('vocabulary')
    print("VOCABULARY HAS {} ITEMS".format(vocab.get_vocab_size(namespace='tokens')))

    if all([os.path.exists('lm-{}-of-{}.pkl'.format(i, NUM_FOLDS)) for i in range(NUM_FOLDS)]):
        print("LOADING CACHED DATASET", flush=True)
        if mode == 'val':
            with open('lm-{}-of-{}.pkl'.format(fold, NUM_FOLDS), 'rb') as f:
                print("Loading split{} for {}".format(fold, mode))
                instances = pkl.load(f)
        else:
            instances = []
            for other_fold in range(NUM_FOLDS):
                if other_fold != fold:
                    with open('lm-{}-of-{}.pkl'.format(other_fold, NUM_FOLDS), 'rb') as f:
                        print("Loading split{} for {}".format(other_fold, mode))
                        instances += pkl.load(f)
        return instances, vocab

    print("MAKING THE DATASET", flush=True)
    assert fold is None
    for item in tqdm(lm_data):
        item['sentences_tokenized'] = [[st.orth_ for st in spacy_model(sent)]
                                       for sent in item['sentences']]

    def _to_instances(data):
        # Flatten this: one instance per adjacent sentence pair.
        instances = []
        for item in data:
            for s1, s2 in pairwise(item['sentences_tokenized']):
                instances.append((
                    Instance({'story': TextField(
                        [Token(x) for x in ['@@bos@@'] + s1 + s2 + ['@@eos@@']],
                        token_indexers={'tokens': SingleIdTokenIndexer(
                            namespace='tokens', lowercase_tokens=True)})}),
                    s1,
                    s2,
                    item,
                ))
        return instances

    random.seed(123456)
    random.shuffle(lm_data)
    all_sets = []
    for fold_ in range(NUM_FOLDS):
        val_set = _to_instances(lm_data[len(lm_data) * fold_ // NUM_FOLDS:
                                        len(lm_data) * (fold_ + 1) // NUM_FOLDS])
        with open('lm-{}-of-{}.pkl'.format(fold_, NUM_FOLDS), 'wb') as f:
            pkl.dump(val_set, f)
        all_sets.extend(val_set)
    return all_sets, vocab
def __init__(self):
    self.spacy = get_spacy_model('en_core_web_sm', False, False, False)
def __init__(self, language: str = "en_core_web_sm") -> None:
    self.spacy = get_spacy_model(language, False, False, False)
import pandas as pd
from tqdm import tqdm

from allennlp.common.util import get_spacy_model

USE_ONLY_GOLD_EXAMPLES = False

spacy_model = get_spacy_model("en_core_web_sm", pos_tags=False, parse=False, ner=False)


def _tokenize(sent):
    return ' '.join([x.orth_.lower() for x in spacy_model(sent)])


for split in ('train', 'val', 'test'):
    df = pd.read_csv('../../data/{}.csv'.format(split))
    df['distractor-3'].fillna('', inplace=True)
    if USE_ONLY_GOLD_EXAMPLES and split == 'train':
        oldsize = df.shape[0]
        df = df[df['gold-source'].str.startswith('gold')]
        print("Going from {} -> {} items in train".format(oldsize, df.shape[0]))
    with open('{}-{}.txt'.format(split, 'goldonly' if USE_ONLY_GOLD_EXAMPLES else 'genalso'),
              'w') as f:
        for _, item in tqdm(df.iterrows()):
            # num_distractors = 4 if (len(item['distractor-3']) != 0 and split == 'train') else 3
def __init__(self,
             language: str = 'en_core_web_lg',
             pos_tags: bool = False,
             parse: bool = False,
             ner: bool = False) -> None:
    self.spacy = get_spacy_model(language, pos_tags, parse, ner)
def __init__(self,
             language: str = 'en_core_web_sm',
             pos_tags: bool = False,
             parse: bool = False,
             ner: bool = False) -> None:
    self.spacy = get_spacy_model(language, pos_tags, parse, ner)
def load_visual_madlibs(split):
    """
    Loads the Visual Madlibs dataset, including captions from COCO as the premises.
    :return:
    """
    # Let's make sure each contains a verb.
    spacy = get_spacy_model("en_core_web_sm", pos_tags=True, parse=True, ner=False)
    from nltk.tokenize.moses import MosesDetokenizer
    # `conjugate` and `PRESENT` are used below; the import was commented out in the
    # original but is required for the tense conversion to run.
    from pattern.en import conjugate, PRESENT
    detokenizer = MosesDetokenizer()

    def _sentence_contains_verb(sent):
        spacy_parse = spacy(sent)
        for tok in spacy_parse:
            if tok.pos_ == 'VERB' and tok.lemma_ not in ('is', 'has'):
                return True
        return False

    def order_sents(sent_list):
        # Score sentences by verb presence plus relative length, best first.
        sentence_has_verb = np.array([_sentence_contains_verb(s) for s in sent_list])
        sentence_length = np.array([len(x) for x in sent_list])
        sentence_score = sentence_has_verb.astype(np.float32) + sentence_length / sentence_length.max()
        best_to_worst = np.argsort(-sentence_score).tolist()
        return [sent_list[i] for i in best_to_worst]

    futures_fn = {
        'train': 'tr_futures.json',
        'val': 'val_easy_multichoice_futures.json',
        'test': 'val_hard_multichoice_futures.json',
    }[split]
    key = {
        'train': 'tr_futures',
        'val': "multichoice_futures",
        'test': "multichoice_futures",
    }[split]
    id2futureandpast = defaultdict(lambda: {'captions': [], 'future': [], 'past': []})
    # with open(os.path.join(DATA_PATH, 'visualmadlibs', 'tr_pasts.json'), 'r') as f:
    #     for item in json.load(f)['tr_pasts']:
    #         id2futureandpast[item['image_id']]['past'] = order_sents(item['fitbs'])
    with open(os.path.join(DATA_PATH, 'visualmadlibs', futures_fn), 'r') as f:
        for item in json.load(f)[key]:
            if split == 'train':
                id2futureandpast[item['image_id']]['future'] = order_sents(item['fitbs'])
            else:
                id2futureandpast[item['image_id']]['future'] = [item['pos']]

    with open(os.path.join(DATA_PATH, 'coco', 'dataset_coco.json'), 'r') as f:
        imgid2caps = {item['cocoid']: ([sent['raw'] for sent in item['sentences']], item['split'])
                      for item in json.load(f)['images']}

    vml = []
    for k in tqdm(id2futureandpast):
        for cap, future in zip(order_sents(imgid2caps[k][0]), id2futureandpast[k]['future']):
            # Spacy parse the future sentence, change to present tense
            spacy_parse = [(x.orth_, x.pos_, x.dep_) for x in spacy(future)]
            # If there's a ROOT verb that doesn't end in -ing, conjugate it to present tense
            is_match = False
            for i, (word, pos, dep) in enumerate(spacy_parse):
                if pos == 'VERB' and dep == 'ROOT' and not word.endswith('ing'):
                    spacy_parse[i] = (conjugate(word, tense=PRESENT), pos, dep)
                    is_match = True
            # Else convert AUXes
            if not is_match:
                for i, (word, pos, dep) in enumerate(spacy_parse):
                    if pos == 'VERB' and dep == 'aux':
                        spacy_parse[i] = (conjugate(word, tense=PRESENT), pos, dep)
            future_fixed = detokenizer.detokenize([x[0] for x in spacy_parse], return_str=True)
            print("{} -> {}".format(future, future_fixed), flush=True)
            future_fixed = future_fixed[0].capitalize() + future_fixed[1:]
            vml.append({'id': k, 'sentences': [cap, future_fixed]})

    # ABANDON THIS FOR NOW: joining captions and futures into single sentences.
    # id2futureandpast[k]['captions'] = (order_sents(imgid2caps[k][0]), imgid2caps[k][1])
    # vml = []
    # for id, val in id2futureandpast.items():
    #     vml.append({'id': id, 'sentences': ['{} Afterwards, {}.'.format(cap, future)
    #                                         for cap, future in zip(val['captions'][0], val['future'])]})
    return vml