Python Dictionary.Dictionary Exemples, flair.data.Dictionary.Dictionary Python Exemples

Exemple #1

0

Afficher le fichier

    def from_corpus(cls, corpus: Corpus, grammar: SupertagGrammar,
                    parameters: ModelParameters):
        """Build a model instance from a grammar, a corpus and parameters.

        Two label dictionaries (without an unknown entry) are filled from
        ``grammar``: one from ``grammar.tags`` (via each tag's ``pos()``) and
        one from ``grammar.pos``. Embeddings come from
        ``EmbeddingFactory(parameters, corpus)``.
        """
        supertag_dict = Dictionary(add_unk=False)
        for supertag in grammar.tags:
            supertag_dict.add_item(supertag.pos())

        pos_dict = Dictionary(add_unk=False)
        for pos_tag in grammar.pos:
            pos_dict.add_item(pos_tag)

        # A negative lstm_dropout falls back to the general dropout value.
        lstm_dropout = parameters.lstm_dropout
        if lstm_dropout < 0:
            lstm_dropout = parameters.dropout

        hidden_size = parameters.lstm_size
        embeddings = EmbeddingFactory(parameters, corpus)
        tagger_options = dict(
            use_rnn=(parameters.lstm_layers > 0),
            rnn_layers=parameters.lstm_layers,
            dropout=parameters.dropout,
            word_dropout=parameters.word_dropout,
            locked_dropout=parameters.locked_dropout,
            lstm_dropout=lstm_dropout,
            reproject_embeddings=False,
        )
        tagger = SequenceMultiTagger(hidden_size, embeddings,
                                     [supertag_dict, pos_dict],
                                     ["supertag", "pos"], **tagger_options)

        return cls(tagger, grammar)

Exemple #2

0

Afficher le fichier

def load_task(data_folder):
    """Read the train/test column files and collect tokens, tags and a tag dictionary.

    Sentences are separated by blank lines; each non-empty line must consist
    of exactly four space-separated fields, of which the first is the token
    and the last is the tag. Tokens starting with ``-DOCSTART-`` are dropped.

    Returns ``(X_train, X_test, y_train, y_test, tag_dictionary)``.
    """
    tag_dictionary = Dictionary()
    for boundary_tag in ('<START>', '<STOP>'):
        tag_dictionary.add_item(boundary_tag)

    parts = ('train', 'test')
    X = {name: [] for name in parts}
    y = {name: [] for name in parts}

    for part in parts:
        raw_text = load_file(data_folder, f'{part}.txt')

        for block in raw_text.split('\n\n'):
            words, tags = [], []

            for line in block.split('\n'):
                if line:
                    token, _, _, tag = re.split(' ', line)
                    if token.startswith("-DOCSTART-"):
                        continue
                    words.append(token)
                    tags.append(tag)
                    tag_dictionary.add_item(tag)

            # Blocks that contributed no tokens are not recorded.
            if words:
                X[part].append(words)
                y[part].append(tags)

    return X['train'], X['test'], y['train'], y['test'], tag_dictionary

Exemple #3

0

Afficher le fichier

Fichier : densest_subgraph_disambiguator.py Projet : ChristophAlt/mEx

    def load_from_cooccurrence_file(cls, coocurrence_file, min_occurrence=0):
        """Build an instance from a tab-separated co-occurrence file.

        Every line that does not start with ``#`` must hold three
        tab-separated fields: an integer count followed by two identifiers.
        Pairs whose count is strictly greater than ``min_occurrence`` are
        entered in both directions into a square boolean sparse matrix whose
        row/column indices are the values returned by
        ``Dictionary.add_item``.

        :param coocurrence_file: path of the file to read
        :param min_occurrence: pairs with a count less than or equal to this
            value are ignored
        :return: ``cls(cooccurrences, id_dictionary)`` where ``cooccurrences``
            is the matrix in CSR format
        """
        id_dictionary = Dictionary(add_unk=True)

        row = []
        col = []

        print('Loading coocurrences, this might take a while...')
        with open(coocurrence_file, 'r') as cooc_file:
            for line in cooc_file:
                if line.startswith('#'):
                    continue
                n_occurrence, cui1, cui2 = line.strip().split('\t')

                if int(n_occurrence) > min_occurrence:
                    idx1 = id_dictionary.add_item(cui1)
                    idx2 = id_dictionary.add_item(cui2)

                    # Store both directions so the matrix is symmetric.
                    row.extend((idx1, idx2))
                    col.extend((idx2, idx1))

        print('n connections:', len(row))

        n_cui = len(id_dictionary)

        print(n_cui)
        data = [1] * len(row)
        # The builtin ``bool`` replaces the ``np.bool`` alias, which was
        # deprecated in NumPy 1.20 and removed in 1.24.
        cooccurrences = coo_matrix((data, (row, col)),
                                   dtype=bool,
                                   shape=(n_cui, n_cui)).tocsr()

        print("> finished!")

        return cls(cooccurrences, id_dictionary)

Exemple #4

0

Afficher le fichier

Fichier : test_data.py Projet : azawalich/flair

def test_dictionary_get_item_for_index():
    """The first item added to a dictionary without <unk> sits at index 0."""
    dictionary = Dictionary(add_unk=False)
    for name in ('class_1', 'class_2', 'class_3'):
        dictionary.add_item(name)

    first_item = dictionary.get_item_for_index(0)

    assert first_item == 'class_1'

Exemple #5

0

Afficher le fichier

Fichier : test_data.py Projet : azawalich/flair

def test_dictionary_get_idx_for_item():
    """The second item added to a dictionary without <unk> gets index 1."""
    dictionary = Dictionary(add_unk=False)
    for name in ('class_1', 'class_2', 'class_3'):
        dictionary.add_item(name)

    position = dictionary.get_idx_for_item('class_2')

    assert position == 1

Exemple #6

0

Afficher le fichier

Fichier : tars_model.py Projet : ydwisroad/competitions

    def __init__(
            self,
            task_name: str,
            label_dictionary: Dictionary,
            label_type: str,
            embeddings: str = 'bert-base-uncased',
            num_negative_labels_to_sample: int = 2,
            prefix: bool = True,
            **tagger_args,
    ):
        """
        Initializes a TARSClassifier: builds an internal True/False TextClassifier
        on top of transformer document embeddings and registers the first task.
        :param task_name: a string depicting the name of the task
        :param label_dictionary: dictionary of labels you want to predict
        :param label_type: label type of the task; passed on, together with
        task_name and label_dictionary, to add_and_switch_to_new_task
        :param embeddings: name of the pre-trained transformer model e.g.,
        'bert-base-uncased' etc. Any value that is not already a
        TransformerDocumentEmbeddings instance is used as `model` for a new one
        (fine_tune=True, layers='-1', layer_mean=False)
        :param num_negative_labels_to_sample: number of negative labels to sample for each
        positive labels against a sentence during training. Defaults to 2 negative
        labels for each positive label. According to the original documentation the
        model samples all the negative labels if None is passed, which slows down
        training considerably (the sampling itself is not visible in this method).
        :param prefix: stored unchanged as self.prefix; how it is used is not
        visible in this method
        :param tagger_args: additional keyword arguments forwarded unchanged to the
        internal TextClassifier
        """
        super(TARSClassifier, self).__init__()

        # imported locally rather than at module level (reason not visible here)
        from flair.embeddings import TransformerDocumentEmbeddings

        # accept either a ready-made embeddings object or a model name
        if not isinstance(embeddings, TransformerDocumentEmbeddings):
            embeddings = TransformerDocumentEmbeddings(model=embeddings,
                                                       fine_tune=True,
                                                       layers='-1',
                                                       layer_mean=False,
                                                       )

        # prepare TARS dictionary: the internal classifier only predicts 'False'/'True'
        tars_dictionary = Dictionary(add_unk=False)
        tars_dictionary.add_item('False')
        tars_dictionary.add_item('True')

        # initialize a bare-bones text classifier over the True/False dictionary
        # NOTE(review): self.static_label_type is not set in this method -
        # presumably a class-level attribute; confirm
        self.tars_model = TextClassifier(document_embeddings=embeddings,
                                         label_dictionary=tars_dictionary,
                                         label_type=self.static_label_type,
                                         **tagger_args,
                                         )

        # transformer separator: the tokenizer's sep token, followed by its bos
        # token when the tokenizer has one
        # NOTE(review): self.tars_embeddings is not assigned in this method -
        # presumably a property defined elsewhere on the class; confirm
        self.separator = str(self.tars_embeddings.tokenizer.sep_token)
        if self.tars_embeddings.tokenizer._bos_token:
            self.separator += str(self.tars_embeddings.tokenizer.bos_token)

        self.prefix = prefix
        self.num_negative_labels_to_sample = num_negative_labels_to_sample

        # Store task specific labels since TARS can handle multiple tasks
        self.add_and_switch_to_new_task(task_name, label_dictionary, label_type)

Exemple #7

0

Afficher le fichier

Fichier : test_utils.py Projet : rkwojdan/flair35

def test_convert_labels_to_one_hot():
    """Only the position of the requested label is set in the one-hot row."""
    label_dict = Dictionary(add_unk=False)
    for label in ('class-1', 'class-2', 'class-3'):
        label_dict.add_item(label)

    encoded = convert_labels_to_one_hot([['class-2']], label_dict)
    first_row = encoded[0]

    assert first_row[0] == 0
    assert first_row[1] == 1
    assert first_row[2] == 0

Exemple #8

0

Afficher le fichier

def test_dictionary_get_item_for_index():
    """Index 0 of a dictionary without <unk> holds the first added item."""
    dictionary: Dictionary = Dictionary(add_unk=False)
    for name in ("class_1", "class_2", "class_3"):
        dictionary.add_item(name)

    looked_up = dictionary.get_item_for_index(0)

    assert looked_up == "class_1"

Exemple #9

0

Afficher le fichier

def test_dictionary_get_idx_for_item():
    """Looking up the second added item yields index 1 when <unk> is disabled."""
    dictionary: Dictionary = Dictionary(add_unk=False)
    for name in ("class_1", "class_2", "class_3"):
        dictionary.add_item(name)

    found_index = dictionary.get_idx_for_item("class_2")

    assert found_index == 1

Exemple #10

0

Afficher le fichier

Fichier : test_utils.py Projet : zhouyonglong/flair

def init():
    """Return fixed true/predicted label matrices and a three-class dictionary."""
    labels = Dictionary(add_unk=False)
    for class_name in ('class-1', 'class-2', 'class-3'):
        labels.add_item(class_name)

    y_true = [
        [0, 1, 1],
        [0, 0, 1],
        [1, 1, 0],
    ]
    y_pred = [
        [0, 1, 1],
        [0, 0, 0],
        [1, 0, 0],
    ]

    return y_true, y_pred, labels

Exemple #11

0

Afficher le fichier

Fichier : test_data.py Projet : azawalich/flair

def test_dictionary_get_items_without_unk():
    """Without <unk>, get_items returns exactly the added items in order."""
    expected = ['class_1', 'class_2', 'class_3']

    dictionary = Dictionary(add_unk=False)
    for name in expected:
        dictionary.add_item(name)

    items = dictionary.get_items()

    assert len(items) == 3
    for position, name in enumerate(expected):
        assert items[position] == name

Exemple #12

0

Afficher le fichier

Fichier : test_data.py Projet : azawalich/flair

def test_dictionary_get_items_with_unk():
    """By default '<unk>' occupies index 0, ahead of the added items."""
    added = ['class_1', 'class_2', 'class_3']

    dictionary = Dictionary()
    for name in added:
        dictionary.add_item(name)

    items = dictionary.get_items()

    assert len(items) == 4
    for position, name in enumerate(['<unk>'] + added):
        assert items[position] == name

Exemple #13

0

Afficher le fichier

Fichier : test_utils.py Projet : bluesea0/ditk

def test_convert_labels_to_one_hot():
    """A single 'class-2' label is encoded as 0, 1, 0 over three classes."""
    label_dict = Dictionary(add_unk=False)
    for label in ("class-1", "class-2", "class-3"):
        label_dict.add_item(label)

    encoded_row = convert_labels_to_one_hot([["class-2"]], label_dict)[0]

    assert encoded_row[0] == 0
    assert encoded_row[1] == 1
    assert encoded_row[2] == 0

Exemple #14

0

Afficher le fichier

Fichier : test_data.py Projet : azawalich/flair

def test_dictionary_save_and_load():
    """A saved dictionary reloads with the same number of items.

    The file is written into a temporary directory, so nothing is left in
    the working directory even when loading or an assertion fails.
    """
    import tempfile

    dictionary = Dictionary(add_unk=False)
    dictionary.add_item('class_1')
    dictionary.add_item('class_2')
    dictionary.add_item('class_3')

    with tempfile.TemporaryDirectory() as tmp_dir:
        file_path = os.path.join(tmp_dir, 'dictionary.txt')
        dictionary.save(file_path)
        loaded_dictionary = dictionary.load_from_file(file_path)

    assert (len(dictionary) == len(loaded_dictionary))
    assert (len(dictionary.get_items()) == len(loaded_dictionary.get_items()))

Exemple #15

0

Afficher le fichier

Fichier : train.py Projet : 73minerva/flair_dep_parser

def make_relations_tag_dictionary(corpus: Corpus,
                                  tag_type='dependency',
                                  special_tags=None) -> Dictionary:
    """Collect every ``tag_type`` tag value of the corpus into a dictionary.

    :param corpus: corpus whose sentences (``get_all_sentences()``) are scanned
    :param tag_type: name of the token tag to read
    :param special_tags: currently ignored; kept so existing callers keep
        working. The default is ``None`` instead of a shared mutable list.
    :return: a dictionary without an unknown entry holding the tag values
    """
    tag_dictionary: Dictionary = Dictionary(add_unk=False)
    for sentence in corpus.get_all_sentences():
        for token in sentence.tokens:
            tag_dictionary.add_item(token.get_tag(tag_type).value)
    return tag_dictionary

Exemple #16

0

Afficher le fichier

def make_tag_dic(corpus, tag_type: str, use_w=False) -> Dictionary:
    """Build a tag dictionary from all ``tag_type`` values found in ``corpus``.

    "O" is added first unless ``use_w`` is true; "<START>" and "<STOP>" are
    always added last.
    """
    tag_dictionary: Dictionary = Dictionary()
    if not use_w:
        tag_dictionary.add_item("O")

    tag_values = (token.get_tag(tag_type).value
                  for sentence in corpus.get_all_sentences()
                  for token in sentence.tokens)
    for value in tag_values:
        tag_dictionary.add_item(value)

    for boundary_tag in ("<START>", "<STOP>"):
        tag_dictionary.add_item(boundary_tag)
    return tag_dictionary

Exemple #17

0

Afficher le fichier

def test_dictionary_get_items_without_unk():
    """get_items on a dictionary without <unk> returns the three added items in order."""
    expected = ["class_1", "class_2", "class_3"]

    dictionary: Dictionary = Dictionary(add_unk=False)
    for name in expected:
        dictionary.add_item(name)

    items = dictionary.get_items()

    assert len(items) == 3
    for position, name in enumerate(expected):
        assert items[position] == name

Exemple #18

0

Afficher le fichier

def test_transformers_keep_tokenizer_when_saving(results_base_path):
    """Save a tagger, load it back and save the loaded copy again.

    The test passes when the save -> load -> save round trip completes
    without raising.
    """
    model_name = "sentence-transformers/paraphrase-albert-small-v2"
    embeddings = TransformerWordEmbeddings(model_name)

    results_base_path.mkdir(exist_ok=True, parents=True)
    first_save_path = results_base_path / "initial_tokenizer.pk"
    second_save_path = results_base_path / "reloaded_tokenizer.pk"

    tagger = SequenceTagger(embeddings, Dictionary(), "ner")
    tagger.save(first_save_path)

    restored_tagger = SequenceTagger.load(first_save_path)
    restored_tagger.save(second_save_path)

Exemple #19

0

Afficher le fichier

def init(tasks_base_path
         ) -> Tuple[TaggedCorpus, TextRegressor, RegressorTrainer]:
    """Create the regression corpus, a text regressor and its trainer.

    :param tasks_base_path: base path handed to ``NLPTaskDataFetcher.load_corpus``
    :return: ``(corpus, model, trainer)``. The previous annotation declared
        only two elements although three are returned.
    """
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.REGRESSION,
                                            tasks_base_path)

    glove_embedding: WordEmbeddings = WordEmbeddings("glove")
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [glove_embedding], 128, 1, False, 64, False, False)

    model = TextRegressor(document_embeddings, Dictionary(), False)

    trainer = RegressorTrainer(model, corpus)

    return corpus, model, trainer

Exemple #20

0

Afficher le fichier

def test_dictionary_get_items_with_unk():
    """A default dictionary lists "<unk>" first, followed by the added items."""
    added = ["class_1", "class_2", "class_3"]

    dictionary: Dictionary = Dictionary()
    for name in added:
        dictionary.add_item(name)

    items = dictionary.get_items()

    assert len(items) == 4
    for position, name in enumerate(["<unk>"] + added):
        assert items[position] == name

Exemple #21

0

Afficher le fichier

Fichier : fewshot-moviecomplex-simple-to-conll3.py Projet : pharnisch/sequence-tars-evaluation

def _get_tag_dictionary_no_prefix(tag_dictionary):
    """Return a dictionary of tag names with their B/I prefixes removed.

    Each entry of ``tag_dictionary.idx2item`` is decoded from UTF-8 and split
    with ``_split_tag``; only tags whose prefix is "B" or "I" are kept. The
    collected names are passed through ``_remove_not_unique_items_from_list``
    before being added to a new dictionary without an unknown entry.
    """
    stripped_tags = []
    for raw_tag in tag_dictionary.idx2item:
        prefix, bare_tag = _split_tag(raw_tag.decode("utf-8"))
        if prefix in ("B", "I"):
            stripped_tags.append(bare_tag)
    stripped_tags = _remove_not_unique_items_from_list(stripped_tags)

    tag_dictionary_no_prefix: Dictionary = Dictionary(add_unk=False)
    for bare_tag in stripped_tags:
        tag_dictionary_no_prefix.add_item(bare_tag)

    return tag_dictionary_no_prefix

Exemple #22

0

Afficher le fichier

    def __init__(self,
                 corpus: Corpus,
                 field: str = "text",
                 embedding_length: int = 300,
                 min_freq: int = 3,
                 separator: str = '_'):
        """Build a vocabulary from the training split and a trainable embedding layer.

        Every training token contributes either its text (``field == "text"``)
        or the value of its ``field`` tag, split on ``separator``. Pieces that
        occur at least ``min_freq`` times form the vocabulary, and an embedding
        of size ``embedding_length`` is created for each vocabulary entry.
        """
        super().__init__()
        self.name = "more-hot"
        self.static_embeddings = False
        self.min_freq = min_freq
        self.field = field
        self.separator = separator

        train_tokens = flatten([sentence.tokens for sentence in corpus.train])

        if field == "text":
            split_values = [tok.text.split(separator) for tok in train_tokens]
        else:
            split_values = [
                tok.get_tag(field).value.split(separator)
                for tok in train_tokens
            ]
        ranked = Counter(flatten(split_values)).most_common()

        # most_common() is sorted by descending frequency, so scanning can
        # stop at the first entry that falls below the threshold.
        vocabulary = []
        for value, freq in ranked:
            if freq >= min_freq:
                vocabulary.append(value)
            else:
                break

        self.vocab_dictionary: Dictionary = Dictionary()
        for value in vocabulary:
            self.vocab_dictionary.add_item(value)

        self.__embedding_length = embedding_length

        print(self.vocab_dictionary.idx2item)
        print(f"vocabulary size of {len(self.vocab_dictionary)}")

        # model architecture: one trainable vector per vocabulary entry
        vocab_size = len(self.vocab_dictionary)
        self.embedding_layer = torch.nn.Embedding(vocab_size,
                                                  self.__embedding_length)
        torch.nn.init.xavier_uniform_(self.embedding_layer.weight)

        self.to(flair.device)

Exemple #23

0

Afficher le fichier

def load_corpus():
    """Load the k16 classification corpus and a binary single-label dictionary.

    Returns ``(corpus, label_dictionary)`` where the dictionary holds the
    labels '0' and '1'.
    """
    # folder in which the train and dev files reside
    data_folder = 'datasets/constrained_classification/k16'

    label_dictionary: Dictionary = Dictionary(add_unk=False)
    label_dictionary.multi_label = False
    for label in ('0', '1'):
        label_dictionary.add_item(label)

    split_files = {
        'dev_file': 'fasttext.valid',
        'train_file': 'fasttext.train',
    }
    corpus: Corpus = ClassificationCorpus(data_folder, **split_files)

    return corpus, label_dictionary

Exemple #24

0

Afficher le fichier

Fichier : text_classification_model.py Projet : x0rzkov/flair

    def _make_ad_hoc_label_dictionary(candidate_label_set: set = None,
                                      multi_label=True) -> Dictionary:
        """
        Creates a dictionary given a set of candidate labels
        :param candidate_label_set: the candidate labels. May be a set, any
        other iterable of labels, a single label given as a string, or None
        (the default), which yields an empty dictionary instead of raising.
        :param multi_label: stored on the dictionary as `multi_label`
        :return: dictionary of labels
        """
        label_dictionary: Dictionary = Dictionary(add_unk=False)
        label_dictionary.multi_label = multi_label

        if candidate_label_set is None:
            # set(None) would raise TypeError; the default means "no labels"
            candidate_label_set = set()
        elif isinstance(candidate_label_set, str):
            # a bare string is one label, not an iterable of characters
            candidate_label_set = {candidate_label_set}
        elif not isinstance(candidate_label_set, set):
            candidate_label_set = set(candidate_label_set)

        for label in candidate_label_set:
            label_dictionary.add_item(label)

        return label_dictionary

Exemple #25

0

Afficher le fichier

    def add_and_switch_to_new_task(
        self,
        task_name,
        label_dictionary: Union[List, Set, Dictionary, str],
        label_type: str,
        multi_label: bool = True,
        force_switch: bool = False,
    ):
        """
        Adds a new task to an existing TARS model. Sets necessary attributes and finally 'switches'
        to the new task. Parameters are similar to the constructor except for model choice, batch
        size and negative sampling. This method does not store the resultant model onto disk.
        :param task_name: a string depicting the name of the task
        :param label_dictionary: the labels you want to predict, given as a Dictionary,
        a list or set of label strings, or a single label string
        :param label_type: string to identify the label type ('ner', 'sentiment', etc.)
        :param multi_label: whether this task is a multi-label prediction problem
        :param force_switch: if True, will overwrite existing task with same name
        """
        if task_name in self._task_specific_attributes and not force_switch:
            log.warning(
                "Task `%s` already exists in TARS model. Switching to it.",
                task_name)
        else:
            # reduce every accepted input type to an iterable of label strings
            if isinstance(label_dictionary, Dictionary):
                label_dictionary = label_dictionary.get_items()
            if isinstance(label_dictionary, str):
                label_dictionary = [label_dictionary]

            # prepare dictionary of tags (without B- I- prefixes and without UNK)
            tag_dictionary = Dictionary(add_unk=False)
            for tag in label_dictionary:
                if tag == "<unk>" or tag == "O":
                    continue
                # The slice is safe for labels shorter than two characters,
                # where indexing tag[1] would raise IndexError.
                if tag[1:2] == "-":
                    tag = tag[2:]
                tag_dictionary.add_item(tag)

            self._task_specific_attributes[task_name] = {
                "label_dictionary": tag_dictionary,
                "label_type": label_type,
                "multi_label": multi_label,
            }

        self.switch_to_task(task_name)

Exemple #26

0

Afficher le fichier

Fichier : tars_model.py Projet : ydwisroad/competitions

    def predict_zero_shot(self,
                          sentences: Union[List[Sentence], Sentence],
                          candidate_label_set: Union[List[str], Set[str], str],
                          multi_label: bool = True):
        """
        Predict on sentences with an ad-hoc label set via a temporary "ZeroShot" task
        :param sentences: input sentence objects to classify
        :param candidate_label_set: set of candidate labels
        :param multi_label: indicates whether multi-label or single class prediction. Defaults to True.
        """

        # nothing to predict against without candidate labels
        if candidate_label_set is None or len(candidate_label_set) == 0:
            log.warning("Provided candidate_label_set is empty")
            return

        # normalise the candidates: a single string becomes a one-element
        # set, any other non-set iterable is converted to a set
        if isinstance(candidate_label_set, str):
            candidate_labels = {candidate_label_set}
        elif isinstance(candidate_label_set, set):
            candidate_labels = candidate_label_set
        else:
            candidate_labels = set(candidate_label_set)

        zero_shot_dictionary = Dictionary(add_unk=False)
        zero_shot_dictionary.multi_label = multi_label
        for candidate in candidate_labels:
            zero_shot_dictionary.add_item(candidate)

        # remember the active task so it can be restored afterwards
        previous_task = self._current_task

        # register the temporary task; its label type is the joined label names
        zero_shot_label_type = '-'.join(zero_shot_dictionary.get_items())
        self.add_and_switch_to_new_task("ZeroShot",
                                        zero_shot_dictionary,
                                        zero_shot_label_type)

        try:
            self.predict(sentences)
        finally:
            # restore the previous task and discard the temporary one,
            # whether or not prediction succeeded
            self.switch_to_task(previous_task)
            self._drop_task("ZeroShot")

Exemple #27

0

Afficher le fichier

Fichier : run_al_multiprocess.py Projet : brightgems/dl_active_learning_project

def load_task(data_folder, task, tag_column, preprocess):
    """Load the train/test column files of a task into token and tag lists.

    Column 0 is read as the token text and column ``tag_column`` as the 'ner'
    tag. With ``preprocess`` true, every token is paired with the POS tag
    that ``nltk.pos_tag`` assigns to it in isolation.

    Returns ``(X_train, X_test, y_train, y_test, tag_dictionary)``.
    """
    tag_dictionary = Dictionary()
    for boundary_tag in ('<START>', '<STOP>'):
        tag_dictionary.add_item(boundary_tag)

    parts = ('train', 'test')
    X = {name: [] for name in parts}
    y = {name: [] for name in parts}

    for part in parts:
        file_path = Path(f'{data_folder}/{task}/{part}.txt')
        print('Loading: ', file_path)

        column_map = {0: 'text', tag_column: 'ner'}
        corpus = ColumnDataset(
            path_to_column_file=file_path,
            column_name_map=column_map,
            tag_to_bioes=None,
            encoding='utf8',
            comment_symbol=None,
            in_memory=True,
            document_separator_token=None,
        )

        for sent in corpus:
            tokens = [word.text for word in sent]
            if preprocess:
                # each token is tagged on its own, without sentence context
                pos_tags = [nltk.pos_tag([tok])[0][1] for tok in tokens]
                features = list(zip(tokens, pos_tags))
            else:
                features = tokens
            X[part].append(features)

            labels = [word.get_tag('ner').value for word in sent]
            y[part].append(labels)

            for label in labels:
                tag_dictionary.add_item(label)

    print('Train size:', len(X['train']))
    print('Test size:', len(X['test']))

    return X['train'], X['test'], y['train'], y['test'], tag_dictionary

Exemple #28

0

Afficher le fichier

def test_dictionary_save_and_load():
    """A saved dictionary reloads with the same number of items.

    The file lives in a temporary directory, so it is removed even when
    loading or an assertion fails and the working directory stays clean.
    """
    import tempfile

    dictionary: Dictionary = Dictionary(add_unk=False)

    dictionary.add_item("class_1")
    dictionary.add_item("class_2")
    dictionary.add_item("class_3")

    with tempfile.TemporaryDirectory() as tmp_dir:
        file_path = os.path.join(tmp_dir, "dictionary.txt")

        dictionary.save(file_path)
        loaded_dictionary = dictionary.load_from_file(file_path)

    assert len(dictionary) == len(loaded_dictionary)
    assert len(dictionary.get_items()) == len(loaded_dictionary.get_items())

Exemple #29

0

Afficher le fichier

Fichier : single_label_corpora.py Projet : m-stoeckel/FlairMultiLabelTagger

    def make_tag_dictionary(self,
                            _=None,
                            add_unk=True) -> Dict[str, Dictionary]:
        """Build one tag dictionary per base tag in ``self.tag_sets``.

        A base tag is skipped when its train, dev or test split is empty.
        Each dictionary holds "O", the tags of the set in sorted order, then
        "<START>" and "<STOP>".

        :param _: ignored
        :param add_unk: forwarded to every ``Dictionary`` that is created
        :return: mapping from base tag to its tag dictionary
        """
        dicts: Dict[str, Dictionary] = {}
        for base_tag, tag_set in self.tag_sets.items():
            split_sizes = (len(self.get_train(base_tag)),
                           len(self.get_dev(base_tag)),
                           len(self.get_test(base_tag)))
            if min(split_sizes) == 0:
                continue

            tag_dictionary: Dictionary = Dictionary(add_unk=add_unk)
            tag_dictionary.add_item("O")
            for tag in sorted(tag_set):
                tag_dictionary.add_item(tag)
            tag_dictionary.add_item("<START>")
            tag_dictionary.add_item("<STOP>")
            dicts[base_tag] = tag_dictionary

        return dicts

Exemple #30

0

Afficher le fichier

Fichier : tars_tagger_model.py Projet : wyp19930313/flair

    def add_and_switch_to_new_task(
        self,
        task_name,
        label_dictionary: Union[List, Set, Dictionary, str],
        tag_type: str = None,
    ):
        """
        Adds a new task to an existing TARS model. Sets necessary attributes and finally 'switches'
        to the new task. Parameters are similar to the constructor except for model choice, batch
        size and negative sampling. This method does not store the resultant model onto disk.
        :param task_name: a string depicting the name of the task
        :param label_dictionary: the labels you want to predict, given as a Dictionary,
        a list or set of label strings, or a single label string
        :param tag_type: stored with the task under the key 'tag_type'
        """
        if task_name in self._task_specific_attributes:
            log.warning(
                "Task `%s` already exists in TARS model. Switching to it.",
                task_name)
        else:

            # reduce every accepted input type to an iterable of label strings
            if isinstance(label_dictionary, Dictionary):
                label_dictionary = label_dictionary.get_items()
            if isinstance(label_dictionary, str):
                # a bare string is one label, not an iterable of characters
                label_dictionary = [label_dictionary]

            # prepare dictionary of tags (without B- I- prefixes)
            tag_dictionary = Dictionary(add_unk=False)
            for tag in label_dictionary:
                if tag == 'O':
                    continue
                # Strip only a one-character prefix such as "B-" or "I-".
                # Splitting on every hyphen would truncate "B-GPE-LOC" to
                # "GPE" and turn a non-prefixed label like "e-mail" into
                # "mail".
                if tag[1:2] == "-":
                    tag = tag[2:]
                tag_dictionary.add_item(tag)

            self._task_specific_attributes[task_name] = {
                'tag_dictionary': tag_dictionary,
                'tag_type': tag_type
            }

        self.switch_to_task(task_name)