def from_dataframe(cls, review_df, cutoff=25):
    """Instantiate the vectorizer from the dataset dataframe

    Args:
        review_df (pandas.DataFrame): the review dataset
        cutoff (int): the parameter for frequency-based filtering
    Returns:
        an instance of the ReviewVectorizer
    """
    review_vocab = Vocabulary(add_unk=True)
    rating_vocab = Vocabulary(add_unk=False)

    # Add ratings
    for rating in sorted(set(review_df.rating)):
        rating_vocab.add_token(rating)

    # Add top words if count > provided count
    word_counts = Counter()
    for review in review_df.review:
        for word in review.split(" "):
            if word not in string.punctuation:
                word_counts[word] += 1

    for word, count in word_counts.items():
        if count > cutoff:
            review_vocab.add_token(word)

    return cls(review_vocab, rating_vocab)
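# The snippet above assumes a Vocabulary class with add_token() and an
# optional <UNK> entry. A minimal sketch of that interface, for reference
# only; names and behavior are assumptions, not the original class:

from collections import Counter
import string


class Vocabulary:
    """Minimal token <-> index mapping (sketch, not the original class)."""

    def __init__(self, add_unk=True, unk_token="<UNK>"):
        self._token_to_idx = {}
        self._idx_to_token = {}
        self.unk_index = self.add_token(unk_token) if add_unk else -1

    def add_token(self, token):
        # Assign the next free index to unseen tokens; idempotent otherwise.
        if token not in self._token_to_idx:
            idx = len(self._token_to_idx)
            self._token_to_idx[token] = idx
            self._idx_to_token[idx] = token
        return self._token_to_idx[token]

    def lookup_token(self, token):
        # Fall back to <UNK> when the token was filtered out by the cutoff.
        return self._token_to_idx.get(token, self.unk_index)

    def __len__(self):
        return len(self._token_to_idx)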
def __init__(self, mode, preprocessed, srcVocaThreshold, tgtVocaThreshold,
             deprelLabelThreshold, printEvery, trainSize, testSize, devSize):
    if preprocessed:
        tgtTrain = './data/processed/train.en'
        actTrain = './data/processed/train.oracle.en'
        tgtDev = './data/processed/dev.en'
        actDev = './data/processed/dev.oracle.en'
        tgtTest = './data/processed/test.en'
        actTest = './data/processed/test.oracle.en'
        srcTrain = './data/processed/train.kr'
        deprelTrain = './data/processed/train.deprel.kr'
        srcDev = './data/processed/dev.kr'
        deprelDev = './data/processed/dev.deprel.kr'
        srcTest = './data/processed/test.kr'
        deprelTest = './data/processed/test.deprel.kr'
    else:
        train_permutation = list(range(0, 99999))
        random.shuffle(train_permutation)
        dev_permutation = list(range(0, 10000))
        random.shuffle(dev_permutation)
        print('Parsing target file into plain sentences & actions...')
        tgtTrain, actTrain = self.conll_to_action('./data/tagged_train.en',
                                                  trainSize, train_permutation)
        tgtDev, actDev = self.conll_to_action('./data/tagged_dev.en',
                                              devSize, dev_permutation)
        print('Parsing source file into plain sentences & dependency relations...')
        srcTrain, deprelTrain = self.conll_to_deprels('./data/tagged_train.kr',
                                                      trainSize, train_permutation)
        srcDev, deprelDev = self.conll_to_deprels('./data/tagged_dev.kr',
                                                  devSize, dev_permutation)

    print('Loading processed data...')
    self.sourceVoc = Vocabulary(srcTrain, srcVocaThreshold, 'lang')
    self.targetVoc = Vocabulary(tgtTrain, tgtVocaThreshold, 'lang')
    self.actionVoc = Vocabulary(actTrain, None, 'action')
    self.deprelVoc = Vocabulary(deprelTrain, deprelLabelThreshold, 'deprel')

    self.trainData = []
    self.devData = []
    self.trainData = self.loadCorpus(srcTrain, tgtTrain, actTrain,
                                     deprelTrain, self.trainData)
    self.devData = self.loadCorpus(srcDev, tgtDev, actDev,
                                   deprelDev, self.devData)
    self.printEvery = printEvery
    print('Loaded...')
def __init__(self, naiveBayesMatrix, beta=-1):
    naiveBayesMatrix = naiveBayesMatrix.todense()

    # v: total word count over the whole matrix
    v = 0
    for x in range(naiveBayesMatrix.shape[0]):
        v += naiveBayesMatrix[x, :].sum()

    # Default to B = 1/v when no beta is supplied.
    B = beta
    if beta == -1:
        B = 1 / v
    alphaMinusOne = B  # (alpha - 1)

    # |V|: length of the vocabulary list
    vocab = Vocabulary()
    vocabListLength = vocab.length

    # (alpha - 1) * |V|, the constant part of the denominator
    denominatorStatic = alphaMinusOne * vocabListLength

    # count(Xi in Yk) + (alpha - 1)
    numerator = naiveBayesMatrix + alphaMinusOne

    # P(Xi|Yk) = (count(Xi in Yk) + (alpha - 1)) / (count(Yk) + (alpha - 1) * |V|)
    for x in range(numerator.shape[0]):
        denominatorDynamic = naiveBayesMatrix[x, :].sum()
        numerator[x, :] *= 1 / (denominatorDynamic + denominatorStatic)

    # log2 P(Xi|Yk)
    self.mapmatrix = np.log2(numerator)
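# A quick numeric sanity check of the MAP smoothing used above. The counts
# are made up; this only verifies that the smoothed per-class distribution
# still sums to 1:

import numpy as np

# Hypothetical counts for one class over a 3-word vocabulary.
counts = np.array([2.0, 0.0, 1.0])
alpha_minus_one = 1 / counts.sum()  # the default B = 1/v, with v = 3 here
V = counts.size

# MAP estimate: (count + (alpha-1)) / (total + (alpha-1) * |V|)
p = (counts + alpha_minus_one) / (counts.sum() + alpha_minus_one * V)
assert abs(p.sum() - 1.0) < 1e-12  # smoothed probabilities still sum to 1
print(np.log2(p))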
def getVocabByID(ID):
    from Vocabulary import Vocabulary

    # Returns a Vocabulary item based on the record ID in the database.
    conn = None
    try:
        conn = sqlite3.connect('FinnVocab.db')
        num2use = str(ID)
        # Pass the parameter as a tuple; a plain string breaks the query for
        # any ID >= 10. See:
        # http://stackoverflow.com/questions/4409539/pythonsqlite-the-like-query-with-wildcards#4409584
        cursor = conn.execute(
            "SELECT ID, FINNISH, ENGLISH FROM VOCABULARY WHERE ID = ?;",
            (num2use, ))

        # There should be exactly one result returned.
        record = cursor.fetchone()
        theID = record[0]
        theFinn = record[1].strip()
        theEngl = record[2].strip()
        theData = Vocabulary(theID, theFinn, theEngl)
    except sqlite3.Error:
        print("Unable to retrieve record number " + str(ID) + ".")
        traceback.print_exc()
        # Set theData to None so the program can fail gracefully.
        theData = None
    finally:
        # Close the connection only if it was actually opened; the original
        # raised a NameError here when connect() itself failed.
        if conn is not None:
            conn.close()
    return theData
def build_vocab(json, threshold):
    """Build a simple vocabulary wrapper."""
    # The loop below needs a COCO annotation index; `coco` and `ids` were
    # missing from the original snippet (this assumes pycocotools).
    coco = COCO(json)
    ids = coco.anns.keys()
    counter = Counter()
    for i, id in enumerate(ids):
        caption = str(coco.anns[id]['caption'])
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        counter.update(tokens)

        if (i + 1) % 1000 == 0:
            print("[{}/{}] Tokenized the captions.".format(i + 1, len(ids)))

    # If the word frequency is less than 'threshold', the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab
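# Typical usage, assuming the standard COCO captions annotation file layout;
# the paths and the word2idx attribute on the wrapper are assumptions:

import pickle

vocab = build_vocab(json='data/annotations/captions_train2014.json',
                    threshold=4)
print('Vocabulary size: {}'.format(len(vocab.word2idx)))

with open('data/vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)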
def get_model(name='saved_models/model', path=extracted_data_train_dir,
              size=TRAIN_SIZE):
    train_vocab = Vocabulary(train_captions, max_vocab_size)
    # Drop the leading token of each encoded caption, then pad to equal length.
    encoded = [
        torch.tensor(x[1:], dtype=torch.int16)
        for x in train_vocab.encoded_captions
    ]
    padded = pad_sequence(encoded).permute(1, 0)
    dataset = MyDataset(enc_captions=padded[:size],
                        image_paths=train_image_paths[:size],
                        data_dir=path + 'vecs/')
    dataloader = DataLoader(dataset=dataset, batch_size=256, num_workers=0)
    criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
    model = End2End(ENC_INPUT, ENC_OUTPUT, DEC_HID_DIM, DEC_OUTPUT, EMB_DIM,
                    ATTN_DIM, train_vocab, criterion, device)
    model.load_state_dict(torch.load(name))
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    return model, dataset, dataloader, optimizer
def build_vocabulary(self, threshold):
    counter = Counter()
    for id in self.image_desc:
        caption = self.image_desc[id]
        tokens = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(caption.lower())
        if len(tokens) > self.input_maxlen - 2:
            self.input_maxlen = len(tokens) + 2
        counter.update(tokens)

    for id in self.story_data:
        temp_in = 0
        temp_out = 0
        for seq in self.story_data[id]:
            caption = seq[2]
            tokens = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(
                caption.lower())
            counter.update(tokens)
            temp_out = temp_out + len(tokens)

            caption_in = self.image_desc[seq[1]]
            tokens = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(
                caption_in.lower())
            temp_in = temp_in + len(tokens)
            counter.update(tokens)

        if temp_out > self.output_maxlen - 2:
            self.output_maxlen = temp_out + 2
        if temp_in > self.input_maxlen - 2:
            # The original assigned temp_out here; temp_in is clearly intended.
            self.input_maxlen = temp_in + 2

    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocabulary = Vocabulary()
    vocabulary.add_word('<pad>')
    vocabulary.add_word('<start>')
    vocabulary.add_word('<end>')
    vocabulary.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocabulary.add_word(word)

    with open("./Dataset/vocabulary.pkl", "wb") as f:
        pickle.dump(vocabulary, f)
    return vocabulary
def __init__(self, corpus = ""): self.voc = Vocabulary() self.tm = scipy.sparse.dok_matrix((1000,1000), dtype=np.float32) self.add_from_text(corpus) self.start = UnigramLM(self.voc) self.valid = False self.sorted_tokens = []
def loadVocabs(self):
    """
    Reads the suffixVocab files from disk and stores them in
    dictionaries and arrays.
    """
    self.vocabs = {}
    for path in self.vocabPaths:
        print(path)
        self.vocabs[path] = Vocabulary(path)
def __init__(self, image_ids, image_folder_path, mode='train', vocab_file="",
             vocab_threshold=5, batch_size=10):
    assert mode in ['train', 'val', 'test']
    self.mode = mode
    self.image_folder_path = image_folder_path
    self.batch_size = batch_size

    # Get pre-processed objects; keep only the selected subset of captions.
    all_captions_dict = load_obj('captions_dict')
    captions_dict = {
        image_id: all_captions_dict[image_id]
        for image_id in image_ids
    }

    # Set up vocabulary (train) or load a previously built one.
    if self.mode == 'train':
        self.vocab = Vocabulary(captions_dict)
        print('Vocabulary successfully created')
    elif vocab_file != "":
        self.vocab = vocab_file
        self.word2idx = self.vocab.word2idx
        self.idx2word = self.vocab.idx2word
    else:
        self.vocab = load_obj("vocab")
        self.word2idx = self.vocab.word2idx
        self.idx2word = self.vocab.idx2word
        print('Vocabulary successfully loaded')

    # Batch size is forced to 1 at test time.
    if self.mode == 'test':
        self.batch_size = 1

    # Set up dataset. im_ids keeps duplicates for indexing: if captions 1-5
    # all correspond to image 8, im_ids will be [8, 8, 8, 8, 8].
    self.im_ids = []
    self.captions = []
    self.images = []
    self.captions_len = []
    for im_id, captions_list in captions_dict.items():
        for item in captions_list:
            self.im_ids.append(im_id)
            self.captions.append(item)
            self.captions_len.append(len(nltk.tokenize.word_tokenize(item)))

    # Set up parameters for image feature extraction.
    self.transform = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])
def __init__(self, collection, doStats=False, postingsFile=False):
    self.collection = collection
    self.lexAnalyser = False
    self.calculateStats = doStats
    self.vocabulary = Vocabulary()
    self.postings = DictionaryPostings({})
    self.documents = Documents()
    self.maxFreqInDocs = {}
    if self.calculateStats:
        self.stats = self.getInitStats()
def generate_vocabulary(counter, threshold):
    """Generate vocabulary."""
    vocab = Vocabulary()

    # Keep words that occur at least `threshold` times.
    words = sorted([word for word, cnt in counter.items() if cnt >= threshold])

    # Add words to the dictionary.
    for i, word in enumerate(words):
        vocab.addWord(word)
    return vocab
def build_vocabulary(self, threshold):
    # A cached-vocabulary shortcut was disabled in the original:
    # if os.path.exists("./Dataset/vocabulary.pkl"):
    #     with open("./Dataset/vocabulary.pkl", "rb") as f:
    #         return pickle.load(f)
    counter = Counter()
    for id in self.image_desc:
        caption = self.image_desc[id]
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        if len(tokens) > self.max_length - 2:
            self.max_length = len(tokens) + 2
        counter.update(tokens)

    for annot in self.coco_desc:
        caption = annot['caption']
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        if len(tokens) > self.max_length - 2:
            self.max_length = len(tokens) + 2
        counter.update(tokens)

    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocabulary = Vocabulary()
    vocabulary.add_word('<pad>')
    vocabulary.add_word('<start>')
    vocabulary.add_word('<end>')
    vocabulary.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocabulary.add_word(word)

    with open("./Dataset/vocabulary.pkl", "wb") as f:
        pickle.dump(vocabulary, f)
    return vocabulary
def build_vocabulary(self, threshold):
    counter = Counter()
    for id in self.image_desc:
        caption = self.image_desc[id]
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        if len(tokens) > self.max_length - 2:
            self.max_length = len(tokens) + 2
        counter.update(tokens)

    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocabulary = Vocabulary()
    vocabulary.add_word('<pad>')
    vocabulary.add_word('<start>')
    vocabulary.add_word('<end>')
    vocabulary.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocabulary.add_word(word)
    return vocabulary
def from_dataframe(cls, predictor_df, classifier, cutoff=25):  # GLOVE_MODEL
    """Instantiate the vectorizer from the dataset dataframe

    Args:
        predictor_df (pandas.DataFrame): the predictor dataset
        classifier (str): model type; 'GloVe' selects a SequenceVocabulary
        cutoff (int): the parameter for frequency-based filtering
    Returns:
        an instance of the ReviewVectorizer
    """
    if classifier == 'GloVe':
        predictor_vocab = SequenceVocabulary()
    else:
        predictor_vocab = Vocabulary(add_unk=True)
    target_vocab = Vocabulary(add_unk=False)
    max_predictor_length = 0

    # Add targets
    for target in sorted(set(predictor_df.target)):
        target_vocab.add_token(target)

    # Add top words if count > provided count
    word_counts = Counter()
    for index, row in predictor_df.iterrows():
        vector = remove_punctuation(row.predictor)
        max_predictor_length = max(max_predictor_length, len(vector))
        for word in vector:
            word_counts[word] += 1

    for word, count in word_counts.items():
        if count > cutoff:
            predictor_vocab.add_token(word)

    return cls(predictor_vocab, target_vocab, max_predictor_length)  # for CNN
def __init__(self, dim_emb, dim_hid,
             vocab_file='./data/preprocessed/vocab_file.vocab'):
    super().__init__()
    self.vocab = Vocabulary()
    self.vocab.load(vocab_file=vocab_file)
    self.embed = torch.nn.Embedding(len(self.vocab), dim_emb)
    self.rnn1 = torch.nn.LSTM(dim_emb, dim_hid, batch_first=True)
    self.rnn2 = torch.nn.LSTM(dim_hid, dim_hid, batch_first=True)
    self.out = torch.nn.Linear(dim_hid, len(self.vocab))
def prepare_training_data(sent_pairs):
    voc = Vocabulary()
    sent_pairs_normalized = []
    for sent_p in sent_pairs:
        # Normalize the incorrect and correct sentence in each pair and
        # append them to the normalized sentence pairs.
        incorrect_sent_normalized = normalize_string(sent_p[0])
        correct_sent_normalized = normalize_string(sent_p[1])
        normalized_sents_pair = (incorrect_sent_normalized,
                                 correct_sent_normalized)
        sent_pairs_normalized.append(normalized_sents_pair)
        # Add the normalized sentence pair to the vocabulary.
        voc.add_sentence_pair(normalized_sents_pair)
    return voc, sent_pairs_normalized
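# A minimal call, assuming normalize_string does typical lower-casing and
# punctuation stripping; the sentence pairs here are purely illustrative:

sent_pairs = [
    ("he go to school", "he goes to school"),   # (incorrect, correct)
    ("she have a cats", "she has a cat"),
]

voc, pairs = prepare_training_data(sent_pairs)
print(len(pairs), "normalized pairs")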
def __init__(self, dim_emb, dim_hid,
             vocab_file='./data/preprocessed/vocab_file.vocab'):
    super(Encoder_Decoder, self).__init__()
    self.vocab = Vocabulary()
    self.vocab.load(vocab_file=vocab_file)
    self.dim_hid = dim_hid
    self.word_embeddings = nn.Embedding(len(self.vocab), dim_emb)
    self.en_lstm = nn.LSTM(dim_emb, dim_hid, batch_first=True)
    self.de_lstm = nn.LSTM(dim_emb, dim_hid, batch_first=True)
    # Fully connected layer mapping the LSTM hidden state to
    # vocabulary-size logits.
    self.hidden2linear = nn.Linear(dim_hid, len(self.vocab))
def get_average_vector_of_sentence(self, sentences_with_word, vectors2D,
                                   vocabulary_model):
    average_vector = []
    for sentence in sentences_with_word:
        vectors = []
        words = Vocabulary(self.language).sentence_to_wordlist(sentence)
        for word in words:
            try:
                vectors.append(
                    vectors2D[vocabulary_model.wv.vocab[word.lower()].index])
            except KeyError:
                # Skip words that are not in the trained vocabulary.
                continue
        # Column-wise mean: one averaged vector per sentence.
        average_vector.append(np.asarray(vectors).mean(axis=0))
    self.average_vector = average_vector
    return average_vector
def main(word, language, part_of_speech, number_of_clusters):
    # Get sentences either from a txt file or from the db.
    sentences = get_corpus_from_txt_file(language)
    # sentences = get_corpus_from_db(language)
    print("Got sentences for language " + language)
    vocabulary = Vocabulary(language)
    print("Created Vocabulary")
    cluster = Cluster(language, number_of_clusters)
    words = vocabulary.make_array_of_words_from_sentences(sentences)
    print("Made arrays for every sentence")
    # Get the trained model (vocabulary).
    throne2vec = vocabulary.build_vocabulary(words)
    print("Trained the model")
    all_word_vectors_matrix_2d = cluster.make_vectors_2D(throne2vec)
    print("Made matrix with vectors")
def from_dataframe(cls, news_df):
    """Instantiate the vectorizer from the dataset dataframe

    Args:
        news_df (pandas.DataFrame): the target dataset
    Returns:
        an instance of the NREVectorizer
    """
    relation_vocab = Vocabulary()
    for relation in set(news_df.relation):
        relation_vocab.add_token(relation)

    seq_vocab = SequenceVocabulary()
    for sequence in news_df.sequence:
        word_list = list(jieba.cut(sequence, cut_all=False))
        seq_vocab.add_many(word_list)

    return cls(seq_vocab, relation_vocab)
def test_knowledge_CRUD(self):
    tag_id = 0
    property_tag = SemanticTag("Property", "ssn:Property")
    measurement_property_tag = SemanticTag(
        "Measurement Property", "ssn:property/MeasurementProperty")
    relation_tag1 = PredicateSemanticTag("is a",
                                         "http://www.w3.org/ns/ssn/#is_a",
                                         property_tag,
                                         measurement_property_tag)
    semantic_net = SemanticNet()
    semantic_net.add_tag(property_tag, tag_id)
    tag_id += 1
    semantic_net.add_tag(measurement_property_tag, tag_id)
    tag_id += 1
    semantic_net.add_predicate(relation_tag1)

    accuracy_tag = SemanticTag("Accuracy",
                               "ssn:property/MeasurementProperty/Accuracy")
    relation_tag2 = PredicateSemanticTag("is a",
                                         "http://www.w3.org/ns/ssn/#is_a",
                                         measurement_property_tag,
                                         accuracy_tag)
    semantic_net.add_tag(accuracy_tag, tag_id)
    semantic_net.add_predicate(relation_tag2)

    vocabulary = Vocabulary(semantic_net, None, None, None, None)
    knowledge = Knowledge(vocabulary, None, None)
    kb = KnowledgeBase(knowledge)

    client = MongoClient()
    db = client.local
    collection = db.knowledge_base

    knowledge_json = jsonpickle.encode(knowledge)
    print("\n" + knowledge_json + "\n")
    print(type(knowledge_json))
    knowledge_bson = bson.son.SON(json.loads(knowledge_json))
    print(type(knowledge_bson))
    semantic_net_json = jsonpickle.encode(semantic_net)
    collection.insert_one(knowledge_bson)
    knowledge_found = collection.find_one(
        bson.son.SON(json.loads(knowledge_json)))
    print("Knowledge retrieved: " + str(knowledge_found))
def from_dataframe(cls, dataset_df, cutoff=c_frequencyCutoff):
    """Instantiate the Vectorizer from the dataset dataframe

    Args:
        dataset_df (pandas.DataFrame): the tweets dataset
        cutoff (int): the parameter for frequency-based filtering
    Returns:
        an instance of the TwitterVectorizer
    """
    # Instantiate the Vocabulary for the text column.
    text_vocabulary = cls._get_text_vocabulary()

    # Instantiate the Vocabulary for the target column.
    target_vocabulary = Vocabulary(add_unknown_token=False)

    # Add elements to the target Vocabulary.
    for target in sorted(set(dataset_df.target)):
        target_vocabulary.add_token(target)

    # Tweet tokenizer to split text into tokens.
    tokenizer = TweetTokenizer()

    # Count word frequencies over the whole dataset.
    word_counts = Counter()
    for text in dataset_df.text:
        for word in tokenizer.tokenize(text):
            word_counts[word] += 1

    # For all extracted words: if a word is not punctuation and appears more
    # than `cutoff` times, add it to the text Vocabulary.
    for word, count in word_counts.items():
        if (word not in string.punctuation) and (count > cutoff):
            text_vocabulary.add_token(word)

    return cls(text_vocabulary, target_vocabulary)
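# Why TweetTokenizer rather than a plain split: it keeps hashtags, mentions,
# and emoticons as single tokens. A quick check (expected output shown as a
# comment; exact behavior can vary across nltk versions):

from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer()
print(tokenizer.tokenize("@user loving this #nlp :-)"))
# e.g. ['@user', 'loving', 'this', '#nlp', ':-)']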
def simple_run():
    """train without k-fold"""
    # set the logger
    utils.set_logger(config.log_dir)
    # use the GPU id given on the command line, if any
    if config.gpu != '':
        device = torch.device(f"cuda:{config.gpu}")
    else:
        device = torch.device("cpu")
    logging.info("device: {}".format(device))
    # process the data: separate text and labels
    processor = Processor(config)
    processor.data_process()
    # build the vocabulary
    vocab = Vocabulary(config)
    vocab.get_vocab()
    # split off a dev set
    word_train, word_dev, label_train, label_dev = dev_split(config.train_dir)
    # simple run without k-fold
    run(word_train, label_train, word_dev, label_dev, vocab, device)
def initialize_model_from_local(path, hparams, de_tokenize=False,
                                verbose=False):
    start = time.time()
    conn = sqlite3.connect(path)
    c = conn.cursor()
    v = Vocabulary(max_len=hparams['MAX_LEN'])
    v.load_vocab_from_local(c, hparams['VOCAB'], verbose)
    c.close()
    conn.close()
    if de_tokenize:
        v.de_tokenize_data()
    if verbose:
        print('Vocabulary created!')
    enc, dec, opt = create_model(hparams)
    print('Time to initialize model {:.2f} min | {:.2f} hrs\n'.format(
        (time.time() - start) / 60, (time.time() - start) / 3600))
    return v, enc, dec, opt
def __init__(self, ngram_file, insert_word_posn, index_insert_words,
             min_count):
    print("\nNGramIndex: Initializing a new index")
    self.index_insert_words = index_insert_words
    self.counts_total = 0
    self.num_uniq_ngrams = 0
    self.vocab = Vocabulary()

    print("Now examining ngram counts in", ngram_file)
    max_lines = self.ngram_file_mincount_line(ngram_file, min_count)
    print("For a min ngram count of", min_count,
          "need to read", max_lines, "ngrams from", ngram_file)

    self.ngram_hash = numpy.zeros(max_lines, dtype=numpy.int64)
    self.ngram_count = numpy.zeros(max_lines, dtype=numpy.int32)
    if index_insert_words:
        self.ngram_gapword = numpy.zeros(max_lines, dtype=numpy.int32)
    self.build_index(ngram_file, insert_word_posn, index_insert_words,
                     max_lines, min_count)
def prep_dataset():
    wiki_path = WIKI_PATH

    if CONTEXT_CAPACITY % 2 != 0:
        raise Exception("Context length should be even")
    context_window = CONTEXT_CAPACITY + 1

    print("Loading...", end="")
    wiki = WikiDataLoader(wiki_path)
    voc = Vocabulary()
    tok = Tokenizer()
    print("done")

    wiki_doc = wiki.next_doc()
    wikiprep = open("WikiPrepData.txt", "w")
    i = 0
    while wiki_doc:
        doc = tok(wiki_doc)
        voc.add(doc)
        sample = np.array(voc.text2ids(doc))
        # Sliding windows via broadcasting: row r holds ids r .. r+context_window-1.
        indexer = np.arange(context_window)[None, :] + np.arange(
            len(sample) - context_window)[:, None]
        smpl = sample[indexer]
        for row in smpl:
            for val in row:
                wikiprep.write("%d " % val)
            wikiprep.write("\n")
        i += 1
        if i == 2000:
            break
        wiki_doc = wiki.next_doc()
    wikiprep.close()  # the original never closed this handle

    pickle.dump(voc, open("WikiPrepVoc.pkl", "wb"))
    print("Vocabulary ready")
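# The indexer line above is the standard broadcasting trick for extracting
# all fixed-length windows from a 1-D array; a self-contained check:

import numpy as np

sample = np.arange(10, 17)  # e.g. token ids [10 .. 16]
w = 3                       # window length (context_window above)
indexer = np.arange(w)[None, :] + np.arange(len(sample) - w)[:, None]
print(sample[indexer])
# [[10 11 12]
#  [11 12 13]
#  [12 13 14]
#  [13 14 15]]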
def main_freq():
    logging.info("Loading dataset")
    dataset = load_dataset("ag_news")
    dataset_text = [r['text'] for r in dataset['train']]
    dataset_labels = [r['label'] for r in dataset['train']]

    logging.info("Building vocabulary")
    vocab = Vocabulary(dataset_text)
    vocab.make_vocab_charts()
    plt.close()
    plt.pause(0.01)

    logging.info("Computing PPMI matrix")
    PPMI = compute_ppmi_matrix(dataset_text, vocab)

    logging.info("Performing Truncated SVD to reduce dimensionality")
    word_vectors = dim_reduce(PPMI)

    logging.info("Preparing T-SNE plot")
    plot_word_vectors_tsne(word_vectors, vocab)
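# compute_ppmi_matrix is project code; for reference, a minimal dense PPMI
# from a (words x contexts) co-occurrence count matrix. A sketch, not the
# project's implementation:

import numpy as np

def ppmi(C, eps=1e-12):
    """Positive PMI from a co-occurrence count matrix C."""
    total = C.sum()
    p_wc = C / total                       # joint probabilities
    p_w = p_wc.sum(axis=1, keepdims=True)  # word marginals
    p_c = p_wc.sum(axis=0, keepdims=True)  # context marginals
    pmi = np.log((p_wc + eps) / (p_w @ p_c + eps))
    return np.maximum(pmi, 0.0)            # clip negative PMI to zero

C = np.array([[2.0, 1.0], [0.0, 3.0]])
print(ppmi(C))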
def k_fold_run():
    """train with k-fold"""
    # set the logger
    utils.set_logger(config.log_dir)
    # use the GPU id given on the command line, if any
    if config.gpu != '':
        device = torch.device(f"cuda:{config.gpu}")
    else:
        device = torch.device("cpu")
    logging.info("device: {}".format(device))
    # process the data: separate text and labels
    processor = Processor(config)
    processor.data_process()
    # build the vocabulary
    vocab = Vocabulary(config)
    vocab.get_vocab()
    # load the training set and run the k-fold splits
    data = np.load(config.train_dir, allow_pickle=True)
    words = data["words"]
    labels = data["labels"]
    kf = KFold(n_splits=config.n_split)
    kf_data = kf.split(words, labels)
    kf_index = 0
    total_test_loss = 0
    total_f1 = 0
    for train_index, dev_index in kf_data:
        kf_index += 1
        word_train = words[train_index]
        label_train = labels[train_index]
        word_dev = words[dev_index]
        label_dev = labels[dev_index]
        test_loss, f1 = run(word_train, label_train, word_dev, label_dev,
                            vocab, device, kf_index)
        total_test_loss += test_loss
        total_f1 += f1
    average_test_loss = float(total_test_loss) / config.n_split
    average_f1 = float(total_f1) / config.n_split
    logging.info("Average test loss: {} , average f1 score: {}".format(
        average_test_loss, average_f1))