def __init__(self, opt):
    self.train_data = None
    self.dev_data = None
    self.test_data = None
    self.word_alphabet = Alphabet('word')
    self.char_alphabet = Alphabet('character')
    self.label_alphabet = Alphabet('label', True)
    self.train_texts = None
    self.train_Ids = None
    self.dev_texts = None
    self.dev_Ids = None
    self.test_texts = None
    self.test_Ids = None
    self.pretrain_word_embedding = None
    self.word_emb_dim = opt.word_emb_dim
    self.config = self.read_config(opt.config)
    self.feat_config = None
    the_item = 'ner_feature'
    if the_item in self.config:
        self.feat_config = self.config[the_item]  ## [POS]:{emb_size:20}
        self.feature_alphabets = []
        self.feature_emb_dims = []
        for k, v in self.feat_config.items():
            self.feature_alphabets.append(Alphabet(k))
            self.feature_emb_dims.append(int(v['emb_size']))
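
# Every snippet in this section leans on an `Alphabet` string-to-id mapping.
# Below is a minimal sketch of that interface, assuming (not confirmed by any
# one source here) support for a name, a label flag, growth locking via
# close(), and UNK fallback once closed:
class Alphabet:
    UNKNOWN = '</unk>'

    def __init__(self, name='alphabet', label=False, keep_growing=True):
        self.name = name
        self.label = label
        self.keep_growing = keep_growing
        self.instance2index = {}
        self.instances = []
        if not label:
            # Reserve an UNK slot for non-label alphabets.
            self.add(Alphabet.UNKNOWN)

    def add(self, instance):
        if instance not in self.instance2index:
            self.instances.append(instance)
            self.instance2index[instance] = len(self.instances) - 1

    def get_index(self, instance):
        if instance in self.instance2index:
            return self.instance2index[instance]
        if self.keep_growing:
            self.add(instance)
            return self.instance2index[instance]
        # Closed alphabet: unseen strings fall back to UNK.
        return self.instance2index[Alphabet.UNKNOWN]

    def get_instance(self, index):
        return self.instances[index]

    def size(self):
        return len(self.instances)

    def __len__(self):
        return self.size()

    def close(self):
        self.keep_growing = False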
def __init__(self):
    super(VsmNormer, self).__init__()
    self.word_alphabet = Alphabet('word')
    self.embedding_dim = None
    self.word_embedding = None
    self.dict_alphabet = Alphabet('dict')
    self.dict_embedding = None
    self.gpu = opt.gpu  # `opt` is a module-level options object, not a parameter
def load_config_pos(config_path, char_embedd_dim):
    max_sent_length, max_char_length, num_labels, embedd_dim_concat = load_config(config_path)
    alphabet_char = Alphabet('char', keep_growing=False)
    alphabet_char.load(config_path, 'alphabet_char')
    alphabet_label = Alphabet('label', keep_growing=False)
    alphabet_label.load(config_path, 'alphabet_label')
    # Uniform init in [-sqrt(3/d), sqrt(3/d)] gives each embedding
    # dimension variance 1/d.
    scale = np.sqrt(3.0 / char_embedd_dim)
    char_embedd_table = np.random.uniform(
        -scale, scale,
        [alphabet_char.size(), char_embedd_dim]).astype(theano.config.floatX)
    return (max_sent_length, max_char_length, num_labels, embedd_dim_concat,
            alphabet_char, alphabet_label, char_embedd_table)
def __init__(self):
    self.MAX_SENTENCE_LENGTH = 250
    self.MAX_WORD_LENGTH = -1
    self.number_normalized = True
    self.norm_word_emb = False
    self.norm_char_emb = False
    self.word_alphabet = Alphabet('word')
    self.char_alphabet = Alphabet('character')
    # self.word_alphabet.add(START)
    # self.word_alphabet.add(UNKNOWN)
    # self.char_alphabet.add(START)
    # self.char_alphabet.add(UNKNOWN)
    # self.char_alphabet.add(PADDING)
    self.label_alphabet = Alphabet('label', True)
    self.tagScheme = "NoSeg"
    self.char_features = "LSTM"  ## "LSTM"/"CNN"
    self.train_texts = []
    self.dev_texts = []
    self.test_texts = []
    self.raw_texts = []
    self.train_Ids = []
    self.dev_Ids = []
    self.test_Ids = []
    self.raw_Ids = []
    self.word_emb_dim = 50
    self.char_emb_dim = 30
    self.pretrain_word_embedding = None
    self.pretrain_char_embedding = None
    self.label_size = 0
    self.word_alphabet_size = 0
    self.char_alphabet_size = 0
    self.label_alphabet_size = 0
    ### hyperparameters
    self.HP_iteration = 100
    self.HP_batch_size = 10
    self.HP_average_batch_loss = False
    self.HP_char_hidden_dim = 50
    self.HP_hidden_dim = 50
    self.HP_dropout = 0.5
    self.HP_lstm_layer = 1
    self.HP_bilstm = True
    self.HP_use_char = False
    self.HP_gpu = False
    self.HP_lr = 0.015
    self.HP_lr_decay = 0.05
    self.HP_clip = None
    self.HP_momentum = 0
def __init__(self):
    self.categories = Categories()
    self.categories.load()
    self.alphabet = Alphabet()
    self.alphabet.load()
    self.responses = []
    self.nextRound()
def __init__(self, data, encoding="utf-8", feature_alphabet=None, alphabet_pop=True,
             alphabet_lock=True, sep=":", bias=False, bias_prefix="@@BIAS@@"):
    Source.__init__(self, data, encoding=encoding)
    self._Instance = BinaryClassificationInstance
    if feature_alphabet is not None:
        self._feature_alphabet = feature_alphabet
    else:
        self._feature_alphabet = Alphabet(locked=False)
    self._sep = sep
    self._bias = bias
    self._bias_prefix = bias_prefix
    if alphabet_pop:
        self._populate_alphabet()
    if alphabet_lock:
        self.lock_alphabet()
    else:
        self.unlock_alphabet()
def initial_feature_alphabets(self):
    # Find the first data line (skip comments and -BOS- markers) to count columns.
    for l in open(self.train_dir, 'r').readlines():
        if not l.startswith("#") and not l.startswith("-BOS-"):
            items = l.strip("\n").split()
            break
    total_column = len(items)
    if total_column > 2:
        for idx in range(1, total_column - 1):
            feature_prefix = items[idx].split(']', 1)[0] + "]"
            self.feature_alphabets.append(Alphabet(feature_prefix))
            self.feature_name.append(feature_prefix)
            print("Find feature:", feature_prefix)
    self.feature_num = len(self.feature_alphabets)
    self.pretrain_feature_embeddings = [None] * self.feature_num
    self.feature_emb_dims = [self.HP_feature_default_size] * self.feature_num
    # self.feature_emb_dims = [20] * self.feature_num
    self.feature_emb_dirs = [None] * self.feature_num
    self.norm_feature_embs = [False] * self.feature_num
    self.feature_alphabet_sizes = [0] * self.feature_num
    if self.feat_config:
        for idx in range(self.feature_num):
            name = self.feature_name[idx]
            if name in self.feat_config:
                self.feature_emb_dims[idx] = self.feat_config[name]['emb_size']
                self.feature_emb_dirs[idx] = self.feat_config[name]['emb_dir']
                self.norm_feature_embs[idx] = self.feat_config[name]['emb_norm']
def __init__(self):
    self.states = State()
    self.sigma = Alphabet()
    self.delta = list()
    self.delta_nfa = list()
    self.initial_state = None
    self.final_state = list()
def forward(self, Y, h, c, outEncoder, teacher_force):
    # Y has shape (num_symbols, 256).
    if np.random.rand() > teacher_force:
        # Feed the ground-truth symbols through the decoder.
        seq_len = Y.shape[0] - 1
        output_decoder = load_to_cuda(torch.autograd.Variable(torch.zeros(seq_len, h.shape[1], 48)))
        Y = self.embedding(Y)
        for i in range(len(Y) - 1):  # -1 because <sos> is not counted in the criterion
            h[0], c[0] = self.lstm1(Y[i], (h[0].clone(), c[0].clone()))
            h[1], c[1] = self.lstm2(h[0].clone(), (h[1].clone(), c[1].clone()))
            h[2], c[2] = self.lstm3(h[1].clone(), (h[2].clone(), c[2].clone()))
            h2 = h[2].clone()
            context = self.attention(h2, outEncoder, BATCH_SIZE)
            context = torch.bmm(context, outEncoder.view(outEncoder.shape[1], outEncoder.shape[0], -1))
            output_decoder[i] = self.MLP(torch.cat((h2, torch.squeeze(context, 1)), 1))
    else:
        # Free-running mode: feed back the decoder's own argmax predictions.
        seq_len = Y.shape[0] - 1
        output_decoder = load_to_cuda(torch.autograd.Variable(torch.zeros(seq_len, h.shape[1], 48)))
        alphabet = Alphabet()
        Y_cur = self.embedding(
            load_to_cuda(Variable(torch.LongTensor([alphabet.ch2index('<sos>')])))
        ).view(1, self.hidden_size)
        for i in range(seq_len - 1):
            Y_cur = Y_cur.expand(BATCH_SIZE, self.hidden_size)
            h[0], c[0] = self.lstm1(Y_cur, (h[0].clone(), c[0].clone()))
            h[1], c[1] = self.lstm2(h[0].clone(), (h[1].clone(), c[1].clone()))
            h[2], c[2] = self.lstm3(h[1].clone(), (h[2].clone(), c[2].clone()))
            h2 = h[2].clone()
            context = self.attention(h2, outEncoder, BATCH_SIZE)
            context = torch.bmm(context, outEncoder.view(outEncoder.shape[1], outEncoder.shape[0], -1))
            output_decoder[i] = self.MLP(torch.cat((h2, torch.squeeze(context, 1)), 1))
            argmax = torch.max(output_decoder[i][0], dim=0)
            Y_cur = self.embedding(
                Variable(load_to_cuda(torch.LongTensor([argmax[1][0].data[0]])))
            ).view(1, self.hidden_size)
    return output_decoder
def initial_feature_alphabets(self):
    items = open(self.train_dir, 'r').readline().strip('\n').split()
    print(items)
    total_column = len(items)
    if total_column > 2:
        for idx in range(1, total_column - 1):
            feature_prefix = items[idx].split(']', 1)[0] + "]"
            print("feature_prefix: {}".format(feature_prefix))
            self.feature_alphabets.append(Alphabet(feature_prefix))
            self.feature_name.append(feature_prefix)
            print("Find feature:", feature_prefix)
    self.feature_num = len(self.feature_alphabets)
    self.pretrain_feature_embeddings = [None] * self.feature_num
    self.feature_emb_dims = [20] * self.feature_num
    self.feature_emb_dirs = [None] * self.feature_num
    self.norm_feature_embs = [False] * self.feature_num
    self.feature_alphabet_sizes = [0] * self.feature_num
    if self.feat_config:
        for idx in range(self.feature_num):
            name = self.feature_name[idx]
            if name in self.feat_config:
                self.feature_emb_dims[idx] = self.feat_config[name]['emb_size']
                self.feature_emb_dirs[idx] = self.feat_config[name]['emb_dir']
                self.norm_feature_embs[idx] = self.feat_config[name]['emb_norm']
def __init__(self):
    self.name2id = {}    # preferred name -> id
    self.id2name = {}    # id -> CTD_Term
    self.altid2id = {}   # alternative id -> id
    if opt.method == 'cla':
        self.id_alphabet = Alphabet('id')
def test_given_alphabet_as_int_returns_error(self):
    # A bare try/except here would also swallow the test's own failure
    # assertion; assertRaises checks that the error is actually raised.
    test_data = 123456
    with self.assertRaises(Exception):
        Alphabet('Test', test_data)
def main():
    UPPER_STRING = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    testAlphabet = Alphabet(UPPER_STRING)
    permutation1 = Permutation("(AELTPHQXRU) (BKNW) (CMOY) (DFG) (IV) (JZ) (S)", testAlphabet)
    permutation2 = Permutation("(FIXVYOMW) (CDKLHUP) (ESZ) (BJ) (GR) (NT) (A) (Q)", testAlphabet)
    permutation3 = Permutation("(ABDHPEJT) (CFLVMZOYQIRWUKXSG) (N)", testAlphabet)
    permutation4 = Permutation("(AEPLIYWCOXMRFZBSTGJQNH) (DV) (KU)", testAlphabet)
    permutation5 = Permutation("(AE) (BN) (CK) (DQ) (FU) (GY) (HW) (IJ) (LO) (MP) (RX) (SZ) (TV)", testAlphabet)
    rotor1 = Rotor("I", permutation1, "TG")
    rotor2 = Rotor("II", permutation2, "A")
    rotor3 = Rotor("III", permutation3, "B")
    rotor4 = Rotor("IV", permutation4, "XO")
    reflector = Reflector("A", permutation5)
    rotors = [reflector, rotor4, rotor3, rotor2, rotor1]
    machine = Machine(testAlphabet, 5, 6, rotors)
    machine.insertRotors(["A", "IV", "III", "II", "I"])
    machine.setRotors("AAAA")
    message = input("What to convert:")
    print(machine.convertMsg(message))
def __init__(self, args):
    # Alphabet
    self.word_alphabet = Alphabet('word')
    self.char_alphabet = Alphabet('character')
    self.label_alphabet = Alphabet('label', True)
    # data
    self.train_texts = []
    self.dev_texts = []
    self.test_texts = []
    self.train_Ids = []
    self.dev_Ids = []
    self.test_Ids = []
    self.input_size = 0
    self.pretrain_word_embedding = None
    self.pretrain_char_embedding = None
    self.word_alphabet_size = 0
    self.char_alphabet_size = 0
    self.label_alphabet_size = 0
    # hyper parameters
    self.HP_word_emb_dim = args.embedding_size
    self.HP_char_emb_dim = args.char_embedding_size
    self.HP_iteration = args.max_epoch
    self.HP_batch_size = args.batch_size
    self.HP_char_hidden_dim = args.char_hidden_dim
    self.HP_hidden_dim = args.hidden_size
    self.HP_dropout = args.dropout
    self.HP_char_dropout = args.char_dropout
    self.HP_use_char = bool(args.char_encoder)
    self.HP_char_features = args.char_encoder
    self.HP_gpu = torch.cuda.is_available() and args.gpu
    self.HP_lr = args.lr
    self.HP_model_name = args.model_name
    self.HP_encoder_type = args.encoder
    self.HP_optim = args.optim
    self.HP_number_normalized = args.number_normalized
    self.HP_seed = args.seed
    self.HP_l2 = args.l2
    self.HP_kernel_size = args.kernel_size
    self.HP_kernel_num = args.kernel_num
def test_cross_off_adds_guessed_letter_to_list_of_guessed_letters(self):
    # arrange
    alphabet = Alphabet()
    letter = "a"
    # act
    alphabet.cross_off(letter)
    # assert
    assert letter in alphabet.guessed_letters
def test_already_guessed_returns_true_if_letter_guessed(self):
    # arrange
    alphabet = Alphabet()
    letter = "h"
    alphabet.cross_off(letter)
    # act
    result = alphabet.already_guessed("h")
    # assert
    assert result is True
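
# A minimal hangman-style Alphabet the two tests above would exercise; the
# names guessed_letters, cross_off and already_guessed come from the tests,
# the rest is assumption:
class Alphabet:
    def __init__(self):
        self.letters = [chr(c) for c in range(ord('a'), ord('z') + 1)]
        self.guessed_letters = []

    def cross_off(self, letter):
        # Record the guess once; repeated guesses are ignored.
        if letter not in self.guessed_letters:
            self.guessed_letters.append(letter)

    def already_guessed(self, letter):
        return letter in self.guessed_letters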
def decode_all(manifests):
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=1,
        keep_transcription_text=True)
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)
    # decoders only accept strings encoded in utf-8
    alphabet = Alphabet(args.vocab_path)
    ds2_model.logger.info("start decoding with extended output...")
    ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                              args.trie_path, alphabet)
    for audioname, manifest_path, duration, offset in manifests:
        # Skip clips shorter than one second.
        try:
            duration_f = float(duration)
            if duration_f < 1.:
                yield (audioname, manifest_path, None, duration, offset)
                continue
        except (TypeError, ValueError):
            pass
        batch_reader = data_generator.batch_reader_creator(
            manifest_path=manifest_path,
            batch_size=args.num_samples,
            min_batch_size=1,
            sortagrad=False,
            shuffle_method=None)
        for decode_data in batch_reader():
            probs_split = ds2_model.infer_batch_probs(
                infer_data=decode_data,
                feeding_dict=data_generator.feeding)
            # note: we only perform single file decoding
            result_transcript = ds2_model.decode_beam_search(
                probs_split=probs_split,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                alphabet=alphabet)
            yield (audioname, manifest_path, result_transcript, duration, offset)
def __init__(self, input_file):
    self.original_data = open(input_file, 'r').readlines()
    self.index_data = []
    self.word_alphabet = Alphabet('word')
    self.gloss_alphabet = Alphabet('gloss')
    self.entity_alphabet = Alphabet('entity')
    self.gaz_alphabet = Alphabet('gaz')
    self.label_alphabet = Alphabet('label')
    self.word_alphabet_size = 0
    self.gloss_alphabet_size = 0
    self.entity_alphabet_size = 0
    self.gaz_alphabet_size = 0
    self.label_alphabet_size = 0
    ### hyperparameters
    self.HP_iteration = 100
    self.HP_batch_size = 1
    self.HP_gaz_hidden_dim = 50
    self.HP_lstm_hidden_dim = 200
    self.HP_dropout = 0.5
    self.gaz_dropout = 0.5
    self.HP_lstm_layer = 1
    self.HP_bilstm = False
    self.HP_use_entity = False
    self.HP_use_gloss = True
    self.HP_use_gaz = False
    self.HP_gpu = True
    self.HP_lr = 0.015
    self.HP_lr_decay = 0.05
    self.HP_clip = 5.0
    self.HP_momentum = 0
    # embedding hyperparameters
    self.word_emb_dim = 200
    self.entity_emb_dim = 50
    self.gloss_features = "CNN"  # ["CNN","LSTM"]
    self.gloss_emb_dim = 200
    self.gloss_hidden_dim = 300
    self.pretrain_word_embedding = np.array([])
    self.pretrain_gaz_embedding = None
    self.word_embed_path = "../LOVECC/NYM.6B.200d.txt"  # "NYM_200.txt"
    self.gaz_embed_path = None
    self.gaz_emb_dim = 200
    self.HP_fix_gaz_emb = True
def get_word(seq):
    # seq is a sequence of character indices; map each back to its character.
    alphabet = Alphabet()
    s = ""
    if len(seq) == 0:
        return s
    for el in seq:
        s += alphabet.index2ch(el)
    return s
def test_str_representation_does_not_show_hidden_letters(self):
    # arrange
    alphabet = Alphabet()
    word = Word(alphabet)
    word.word_to_guess = "aardvark"
    word.guess_letter("a")
    # act
    hidden_word = str(word)
    # assert
    assert hidden_word == "aa___a__"
def make_alphabet():
    alphabet = Alphabet(0)
    load_dataset("%s/%s.train.txt" % (data, dataset), alphabet)
    load_dataset("%s/%s.valid.txt" % (data, dataset), alphabet)
    if dataset == 'ptb':
        # add all the words in all three datasets
        load_dataset("%s/%s.test.txt" % (data, dataset), alphabet)
    print("%s: total %d words" % (dataset, len(alphabet)))
    pickle.dump(alphabet, open("%s/alphabet.pkl" % data, "wb"))
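
# Hypothetical round-trip for the alphabet pickled by make_alphabet(); the
# get_index call follows the sketch interface above and is an assumption
# about this repo's Alphabet API:
import pickle

with open("%s/alphabet.pkl" % data, "rb") as f:
    alphabet = pickle.load(f)
print("vocab size: %d" % len(alphabet))
print("id of 'the': %d" % alphabet.get_index('the'))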
def __init__(self, args):
    self.config = yaml.load(open('config.yaml', 'r'), Loader=yaml.FullLoader)
    if args.dataset not in self.config['data_list']:
        raise KeyError("No such dataset named {}.".format(args.dataset))
    self.config['dataset'] = args.dataset
    self.datatype = 'binary'
    if self.config['dataset'] in self.config['datatype']['train_test']:
        self.datatype = 'train_test'
    self.alphabet = Alphabet('word')
    self.set_seed()
def __init__(self):
    self.MAX_SENTENCE_LENGTH = 512
    self.MAX_WORD_LENGTH = -1
    self.number_normalized = False
    self.word_alphabet = Alphabet('word')
    self.char_alphabet = Alphabet('character')
    self.word_alphabet.add(START)
    self.word_alphabet.add(UNKNOWN)
    self.char_alphabet.add(START)
    self.char_alphabet.add(UNKNOWN)
    self.char_alphabet.add(PADDING)
    self.label_alphabet = Alphabet('label')
    self.tagScheme = "NoSeg"
    self.train_texts = []
    self.dev_texts = []
    self.test_texts = []
    self.raw_texts = []
    self.train_Ids = []
    self.dev_Ids = []
    self.test_Ids = []
    self.raw_Ids = []
    self.word_emb_dim = 50
    self.pretrain_word_embedding = None
    self.label_size = 0
    self.word_alphabet_size = 0
    self.char_alphabet_size = 0
    self.label_alphabet_size = 0
    ### hyperparameters
    self.HP_batch_size = 10
    self.HP_hidden_dim = 200
    self.HP_dropout = 0.5
    self.HP_lstm_layer = 1
    self.HP_bilstm = True
    self.HP_use_char = True
    self.HP_gpu = False
    self.HP_lr = 0.015
    self.HP_lr_decay = 0
    self.HP_clip = 5.0
    self.HP_momentum = 0
def str_to_id():
    global alphabet_pos, alphabet_chunk, alphabet_tag
    # Each alphabet is closed right after indexing the training split, so
    # strings first seen in dev/test do not grow it.
    alphabet_pos = Alphabet('pos')
    train_pos_id = map_string_2_id(train_pos, alphabet_pos)
    alphabet_pos.close()
    dev_pos_id = map_string_2_id(dev_pos, alphabet_pos)
    test_pos_id = map_string_2_id(test_pos, alphabet_pos)

    alphabet_chunk = Alphabet('chunk')
    train_chunk_id = map_string_2_id(train_chunk, alphabet_chunk)
    alphabet_chunk.close()
    dev_chunk_id = map_string_2_id(dev_chunk, alphabet_chunk)
    test_chunk_id = map_string_2_id(test_chunk, alphabet_chunk)

    alphabet_tag = Alphabet('tag')
    train_tag_id = map_string_2_id(train_tag, alphabet_tag)
    alphabet_tag.close()
    dev_tag_id = map_string_2_id(dev_tag, alphabet_tag)
    test_tag_id = map_string_2_id(test_tag, alphabet_tag)
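
# Why close() is called right after indexing the training split: a closed
# alphabet maps unseen dev/test strings to a fixed UNK id instead of minting
# new ids. A small demonstration against the sketch Alphabet above (assumed
# behavior, not confirmed by this repo):
pos = Alphabet('pos')
train_id = pos.get_index('NN')   # grows the alphabet
pos.close()
dev_id = pos.get_index('XYZ')    # unseen string: falls back to the UNK id
assert dev_id != train_id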
def init_alphabet(self):
    # Read the entire train/val/test data to determine the set of unique
    # characters the alphabet should contain.
    unique_chars = set()
    for split in ['train', 'validation', 'test']:
        for entry in self.data_desc[split]:
            for char in entry['trans'].split():
                unique_chars.add(char)
    # Add the CTC blank as the first letter in the alphabet, and sort the
    # rest lexicographically for convenience.
    self.alphabet = Alphabet(['<ctc-blank>', *sorted(unique_chars)])
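
# '<ctc-blank>' is placed at index 0 so a CTC decoder can treat id 0 as the
# blank. A minimal greedy CTC collapse (merge repeats, then drop blanks),
# assuming per-frame argmax ids as input:
def ctc_greedy_collapse(frame_ids, blank_id=0):
    out, prev = [], None
    for i in frame_ids:
        if i != prev and i != blank_id:
            out.append(i)
        prev = i
    return out

# e.g. ctc_greedy_collapse([0, 5, 5, 0, 3, 3, 3, 0]) -> [5, 3]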
def initial_feature_alphabets(self):
    feature_prefix = '[Cap]'
    self.feature_alphabets.append(Alphabet(feature_prefix))
    self.feature_name.append(feature_prefix)
    self.feature_name2id[feature_prefix] = 0

    feature_prefix = '[POS]'
    self.feature_alphabets.append(Alphabet(feature_prefix))
    self.feature_name.append(feature_prefix)
    self.feature_name2id[feature_prefix] = 1

    self.feature_num = len(self.feature_alphabets)
    self.feature_emb_dims = [20] * self.feature_num
    self.feature_alphabet_sizes = [0] * self.feature_num
    if self.feat_config:
        for idx in range(self.feature_num):
            if self.feature_name[idx] in self.feat_config:
                self.feature_emb_dims[idx] = self.feat_config[self.feature_name[idx]]['emb_size']
def map_string_2_id_open(string_list, name):
    string_id_list = []
    alphabet_string = Alphabet(name)
    for strings in string_list:
        ids = []
        for string in strings:
            ids.append(alphabet_string.get_index(string))
        string_id_list.append(ids)
    alphabet_string.close()
    return string_id_list, alphabet_string
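
# Hypothetical call showing the shape contract of map_string_2_id_open:
# a list of token lists in, a parallel list of id lists out, plus the now
# closed alphabet.
sent_ids, pos_alphabet = map_string_2_id_open([['NN', 'VB'], ['NN']], 'pos')
# sent_ids might be [[1, 2], [1]]; after close(), pos_alphabet.get_index('JJ')
# maps to the UNK id rather than growing the alphabet.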
def main():
    data_dir = 'tweets/hashtag_top100_smileys_tweets_{}.gz'
    output_dir_tweets = 'parsed_tweets/hashtag_top100_smiley_tweets_{}.tweets.npy'
    output_dir_hashtags = 'parsed_tweets/hashtag_top100_smiley_tweets_{}.hashtags.npy'
    outdir = 'parsed_tweets'

    alphabet_words = Alphabet(start_feature_id=0)
    alphabet_words.add('UNKNOWN_WORD_IDX')
    alphabet_words.add('DUMMY_WORD_IDX')
    dummy_word_idx = DUMMY_WORD_IDX

    alphabet_hashtags = Alphabet(start_feature_id=0)
    alphabet_hashtags.add('UNKNOWN_HASHTAG_IDX')

    for inp in ('train', 'test'):
        store_file(data_dir.format(inp), output_dir_tweets.format(inp),
                   alphabet_words, alphabet_hashtags, dummy_word_idx,
                   output_dir_hashtags.format(inp))

    # Binary mode ('wb') is required for pickle on Python 3.
    pickle.dump(alphabet_words, open(os.path.join(outdir, 'vocab_words.pickle'), 'wb'))
    pickle.dump(alphabet_hashtags, open(os.path.join(outdir, 'vocab_hashtags.pickle'), 'wb'))
def time_stamp_calc(self):
    time = floor(self.creation_time)
    # Seed value to compare against; here, the date this code was written.
    seed = 10012019
    alpha = Alphabet()
    alpha.shuffle()
    index = time % seed
    # digit_one = alpha[temp_time % seed]
    # alpha = alpha.shuffle()
    # digit_two = alpha[temp_time % seed]
    return index
def __init__(self):
    config = yaml.load(open('config.yaml', 'r'), Loader=yaml.FullLoader)
    config = AttrDict(config)
    self.config = config
    self.tokenizer = BertTokenizer.from_pretrained(config.bert_path, do_lower_case=False)
    if not os.path.exists(self.config.data_dir):
        os.makedirs(self.config.data_dir)
    self.CLS, self.SEP = config.CLS, config.SEP
    self.label_alphabet = Alphabet('label', padflag=False, unkflag=False,
                                   init_list=['O', self.CLS, self.SEP])