Example No. 1
def text2hash(df):
    # columnsHead is assumed to be a list of column names defined elsewhere in the module
    df.columns = columnsHead
    df['protocol_type'] = df['protocol_type'].apply(
        lambda x: hashing_trick(x, 200, hash_function='md5', filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~ '))
    df['service'] = df['service'].apply(
        lambda x: hashing_trick(x, 200, hash_function='md5', filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~ '))
    df['flag'] = df['flag'].apply(
        lambda x: hashing_trick(x, 200, hash_function='md5', filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~ '))
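For context, a minimal self-contained sketch of what this transformation does on hypothetical toy data (the column values below are made up; hashing_trick turns each string cell into a short list of integer indices in the range 1..199):

import pandas as pd
from keras.preprocessing.text import hashing_trick

toy = pd.DataFrame({'protocol_type': ['tcp', 'udp'],
                    'service': ['http', 'ftp'],
                    'flag': ['SF', 'S0']})
toy['protocol_type'] = toy['protocol_type'].apply(
    lambda x: hashing_trick(x, 200, hash_function='md5'))
# each cell now holds a list with one integer per token, e.g. [<int between 1 and 199>]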
Example No. 2
	def compile_tag_data(self, cat):
		"""Compile data for the given category (indexed in the following list)"""

		n = self.n
		isize = self.isize
		
		if cat == 2:
			return [], [] # no irish data

		text = ""
		with open(self.categories[cat] + "_data") as tf:
			for line in tf:
				text += line.strip() #setup for character level hashing

		raw_data = hashing_trick(text, n, hash_function='md5', lower=False, split=' ')

		#with this data, will cut it into groups of isize (this is just a list)
		# and also make a label for each of these groups with the category index 
		labels = []
		data = []
		for i in range(0, len(raw_data), isize):
			if i + isize < len(raw_data): #in bounds
				data.append(raw_data[i:i+isize])
			else:
				zeros = [0]*(i+isize - len(raw_data)) #pad the rest of the values
				data.append(raw_data[i:len(raw_data)] + zeros)
			labels.append(cat) #an index for each category is training target
		return data, labels #return two arrays 
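The chunk-and-pad loop above (and the identical one in the next example) can be read in isolation; a minimal sketch of the same logic on hypothetical inputs, with no Keras dependency:

def chunk_and_pad(raw_data, isize, cat):
    data, labels = [], []
    for i in range(0, len(raw_data), isize):
        chunk = raw_data[i:i + isize]
        data.append(chunk + [0] * (isize - len(chunk)))  # zero-pad the final chunk
        labels.append(cat)
    return data, labels

# chunk_and_pad([5, 3, 7, 1, 9], isize=2, cat=0)
# -> ([[5, 3], [7, 1], [9, 0]], [0, 0, 0])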
Example No. 3
	def compile_data(self, cat):
		"""Compile data for the given category (indexed in the following list)"""

		n = self.n
		isize = self.isize

		text = ""
		for filename in os.listdir("./langs/" + self.categories[cat]):
			with open("./langs/" + self.categories[cat] + "/" + filename) as tf:
				for line in tf:
					for c in line:
						if c.isalpha():
							text += c + " " #setup for character level hashing
						elif c == " ":
							text += "_ "

		raw_data = hashing_trick(text, n, hash_function='md5', lower=False, split=' ')

		#with this data, will cut it into groups of isize (this is just a list)
		# and also make a label for each of these groups with the category index 
		labels = []
		data = []
		for i in range(0, len(raw_data), isize):
			if i + isize < len(raw_data): #in bounds
				data.append(raw_data[i:i+isize])
			else:
				zeros = [0]*(i+isize - len(raw_data)) #pad the rest of the values
				data.append(raw_data[i:len(raw_data)] + zeros)
			labels.append(cat) #an index for each category is training target
		return data, labels #return two arrays 
Example No. 4
def encode_string(text, num_words=2000):
    return hashing_trick(text,
                         num_words,
                         filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                         lower=True,
                         split=' ',
                         hash_function='md5')
Example No. 5
def encode(text_list):
    # vocab_size is assumed to be defined at module level
    encoded_list = []
    for text in text_list:
        encoded = hashing_trick(text, vocab_size, hash_function='md5')
        encoded_list.append(encoded)
    return encoded_list
Example No. 6
def read_program():
    vocab = list()
    vocab_one_hot = list()
    vocab_hash = list()
    filters = '\n'  # renamed from `filter` to avoid shadowing the builtin
    for file in glob.glob('/root/eclipse-workspace/Ranking/src/C/*.c'):
        print(file)
        with open(file, "r") as f:  # context manager so the file is closed
            text = f.read()

        print(text)

        vocab.append(text_to_word_sequence(text, filters=filters, split=' '))
        print(vocab[-1])

        words = set(vocab[-1])
        vocab_size = len(words)
        print("vocab_size = ", vocab_size)

        # One-hot encoding
        print('One-hot encoding:')
        vocab_one_hot.append(one_hot(text, round(vocab_size * 1.3), filters=filters))
        print(vocab_one_hot[-1])

        # Hash encoding
        print('Hash encoding:')
        vocab_hash.append(hashing_trick(text, round(vocab_size * 1.3), hash_function='md5', filters=filters))
        print(vocab_hash[-1])

        print('--------------------------')
Example No. 7
def text2hash(df, cols, toHash=['service', 'flag', 'protocol_type']):
    df.columns = cols
    for el in toHash:
        df[el] = df[el].apply(
            lambda x: hashing_trick(x,
                                    200,
                                    hash_function='md5',
                                    filters='!"#$%&()*+,-./:;<=>?@[\]^`{|}~ '))
Example No. 8
 def test_hashing_trick_md5(self):
     text = 'The cat sat on the mat.'
     encoded = preprocessing_text.hashing_trick(text,
                                                5,
                                                hash_function='md5')
     self.assertEqual(len(encoded), 6)
     self.assertLessEqual(np.max(encoded), 4)
     self.assertGreaterEqual(np.min(encoded), 1)
Example No. 9
def build_embedding(graph):
    """
    Given a graph, creates word embeddings for the names of all nodes.

    First the nodes in the graph are ordered by their labels. Next, for each
    node in the ordered list, a hashed integer encoding of its name is computed,
    which is a list of integers. Each list is padded, and the padded lists for all
    nodes are combined into a single list of integers, which is returned.

    :param graph: A Graph object describing the input data
    :return: A 1D numpy array of shape (MAX_NODES*EMBEDDING_LENGTH,)
    """

    nodes_list = list(graph.nodes.values())
    sorted_nodes = sorted(nodes_list,
                          key=lambda x: hash_labels_only(
                              labels=x.labels, node_label_hash=NODE_TYPE_HASH))
    embedding = []

    for i in range(MAX_NODES):
        if i < len(sorted_nodes) and "name" in sorted_nodes[i].properties and \
                        sorted_nodes[i].properties["name"] != []:

            # The 'name' property on each node is a list, the current solution is to
            # take the first element.
            name = sorted_nodes[i].properties["name"][0]
            encoded_name = hashing_trick(name, VOCAB_SIZE, hash_simhash)

            if "cmdline" in sorted_nodes[i].properties:
                cmdline = sorted_nodes[i].properties["cmdline"]
                encoded_cmdline = hashing_trick(cmdline, VOCAB_SIZE,
                                                hash_simhash)
            else:
                encoded_cmdline = []

            embedding += [encoded_name, encoded_cmdline]
        else:
            embedding += [[], []]

    padded_embedding = pad_sequences(embedding, maxlen=EMBEDDING_LENGTH)
    combined_embedding = [
        num for sublist in padded_embedding for num in sublist
    ]
    return np.asarray(combined_embedding, dtype=np.int16)
Example No. 10
 def gethash(self, big_input, max_words=10, hash_mole=20000):
     hashed = np.zeros((len(big_input), max_words), dtype='float32')
     for i in range(len(big_input)):
         j = 0
         for h in text.hashing_trick(big_input[i], hash_mole, hash_function='md5'):
             if j == max_words:
                 print('haiku too long? ', big_input[i])
                 break  # stop before writing past the fixed-width row
             hashed[i][j] = h
             j += 1
     return hashed
Example No. 11
def text2hash(df):  # hash all the columns of a dataframe that contain string values
    for el in df.columns:
        if not isNumeric(df[el]):
            df[el] = df[el].apply(lambda x: hashing_trick(
                str(x),
                200,
                hash_function='md5',
                filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~ '))
Example No. 12
 def gethash(self, big_input, max_words=10, hash_mole=20000):
     hashed = np.ones((len(big_input), max_words + 2), dtype='float32')
     for i in range(len(big_input)):
         hashed[i][0] = 0
         j = 1
         for h in text.hashing_trick(big_input[i], hash_mole, hash_function='md5'):
             if j == max_words:
                 print('input too long? ', big_input[i])
                 break
             hashed[i][j] = h
             j += 1
     return hashed
Example No. 13
def IntEncodeWords(wordlist):
    vocab_size = 200000
    max_length = 10
    #integer encoding the syllables
    encoded_words = [
        hashing_trick(d, vocab_size, hash_function='md5') for d in wordlist
    ]
    #padding to a max length of 10
    padded_words = pad_sequences(encoded_words,
                                 maxlen=max_length,
                                 padding='post')
    return padded_words
Example No. 14
def prepare_data(data, classes, vocab):
    inputs = []
    labels = []
    for line, char in zip(*data):
        #inputs.append(one_hot(line, len(vocab)))
        inputs.append(hashing_trick(line, len(vocab), hash_function='md5'))

        one_hot_out = np.zeros(len(classes))
        one_hot_out[classes.index(char)] = 1
        labels.append(one_hot_out)

    return np.asarray(pad_sequences(inputs,
                                    padding='post')), np.asarray(labels)
Example No. 15
def text_to_ids(text):
    if type(text) == str:
        text = [text]

    encoded_texts = [
        hashing_trick(t, settings.VOCABULARY_SIZE, hash_function='md5')
        for t in text
    ]
    padded_texts = pad_sequences(encoded_texts,
                                 maxlen=settings.PHRASE_MAX_LENGTH,
                                 padding='post')

    return padded_texts
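A usage sketch for the function above, under the assumption that settings.VOCABULARY_SIZE and settings.PHRASE_MAX_LENGTH are plain integers (the stand-in class and its values below are hypothetical):

# hypothetical stand-in for the project's settings module
class settings:
    VOCABULARY_SIZE = 2000
    PHRASE_MAX_LENGTH = 10

ids = text_to_ids('the quick brown fox')
print(ids.shape)  # (1, 10): one row per input string, zero-padded at the end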
Example No. 16
def generator_inout(d1, d2, cat1, cat2, batch_size, vocab):
    c1 = True
    c2 = True
    file1 = open(d1, "r")
    file2 = open(d2, "r")
    while True:
        batch_input = []
        batch_output = []
        for i in range(0, batch_size):
            if c1 and c2:
                if random.randint(0, 1) > 0:
                    text = file1.readline()
                    cat = cat1
                    if text == "":
                        c1 = False
                else:
                    text = file2.readline()
                    cat = cat2
                    if text == "":
                        c2 = False
            else:
                if c1:
                    text = file1.readline()
                    cat = cat1
                    if text == "":
                        c1 = False
                if c2:
                    text = file2.readline()
                    cat = cat2
                    if text == "":
                        c2 = False
            if not (c1) and not (c2):
                file1.close()
                file2.close()
                file1 = open(d1, "r")
                file2 = open(d2, "r")
                c1 = True
                c2 = True

            input = hashing_trick(text,
                                  round(vocab * 1.3),
                                  hash_function='md5')
            output = cat

            batch_input += [input]
            batch_output += [output]

        batch_x = pad_sequences(np.array(batch_input), maxlen=14)
        batch_y = to_categorical(np.array(batch_output), num_classes=2)

        yield batch_x, batch_y
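A consumption sketch for the generator above (the file paths, categories, batch size, and vocabulary below are hypothetical). Each batch is a (batch_size, 14) array of padded hashed indices plus a (batch_size, 2) one-hot label array, which matches the input_shape=(14,) / Dense(2, softmax) model shown in Example No. 24:

gen = generator_inout('pos.txt', 'neg.txt', cat1=0, cat2=1, batch_size=32, vocab=5000)
batch_x, batch_y = next(gen)
print(batch_x.shape, batch_y.shape)  # (32, 14) (32, 2)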
Example No. 17
 def text2HashIntegers(word,
                       max_nb_words,
                       filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                       lower=False,
                       split=" "):
     if not uStr.isValidWord(word):
         return []
     nbs = text.hashing_trick(
         text=word,
         n=max_nb_words,
         hash_function='md5',  # 'md5' is a stable hashing function, consistent across runs
         filters=filters,
         lower=lower,
         split=split)
     return nbs
Example No. 18
def neroset(request):

    com = list()

    json_file = open("imdb_model.json", "r")
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_model_json)
    loaded_model.load_weights("imdb_model.h5")
    loaded_model.compile(loss="binary_crossentropy",
                         optimizer="adam",
                         metrics=["accuracy"])

    for i in range(0, 2):
        text = Comment.text[i]

        sequence = hashing_trick(text,
                                 n=64,
                                 hash_function=None,
                                 filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
                                 lower=True,
                                 split=' ')
        scores = skipgrams(sequence,
                           vocabulary_size=5000,
                           window_size=32,
                           negative_samples=1.0,
                           shuffle=False,
                           categorical=False,
                           sampling_table=None,
                           seed=None)

        b = scores[1:2]
        kol = 0
        num2 = 0
        for g in b:
            num1 = g
            for c in num1:
                num2 += c
                kol += 1

        k = num2 / kol
        if k >= 0.5:
            com.append('Хорошая')  # 'Good'
        else:
            com.append('Плохая')  # 'Bad'
    return render(request, 'blog/home.html', {'com': com})  # render() expects a context dict
Example No. 19
def preprocessing_tokenize(X, num_words=160):
    # _tokenizer = Tokenizer(num_words=NUM_WORDS,
    #                       filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    #                       lower=True,
    #                       split=" ",
    #                       char_level=False)
    #
    # _tokenizer.fit_on_texts(X)
    # print("There where found {} unique tokens. ".format(len(_tokenizer.word_index)))
    # _X = _tokenizer.texts_to_matrix(X)

    _X = []
    for x in X:
        _X.append(hashing_trick(x, num_words, hash_function='md5', split=' '))

    _X = pad_sequences(_X, maxlen=MAX_SEQUENCE_LENGTH)

    return _X
Example No. 20
	def make_test_data(self, line):
		"""Make data set to predict the languages in a line of finnegans wake."""
		line = line.strip()
		text = ""
		for c in line:
			if c == " ":
				text += "_ " #encode spaces just like test data
			else:
				text += c + " "
		
		raw_data = hashing_trick(text, self.n, hash_function='md5', lower=False, split=' ')

		data = []
		if len(raw_data) < self.isize: #add padding
			data.append(raw_data + [0]*(self.isize-len(raw_data))) #add 0 up to isize
		for i in range(len(raw_data) - self.isize + 1): #include the final isize-length window
			data.append(raw_data[i:i+self.isize]) #get encodings for this window
		return data
Example No. 21
def prepareData(vocabulary_size, list_of_intent_dicts):
    x_train, y_train = [], []
    intents = []
    for intent_dict in list_of_intent_dicts:
        if intent_dict['intent'] not in intents:
            intents.append(intent_dict['intent'])
        for sentence in intent_dict['sentences']:
            sentence_vec = text.hashing_trick(
                sentence,
                vocabulary_size,
                hash_function='md5',
                filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                lower=True,
                split=' ')
            x_train.append(sentence_vec)
            y_train.append(
                [1 if i == intent_dict['intent'] else 0 for i in intents])

    return intents, x_train, y_train
Example No. 22
    def preprocessing_data_hashing_trick(self, vocab_size=5000, max_len=100):
        print('Preprocessing data...')

        y = self.df['label'].astype('U')
        data_f = self.df['text'].astype('U')

        data_f = self.clear_text(data_f)
        data = []
        for xt in data_f:
            xt = ' '.join(text_to_word_sequence(xt))
            data.append(
                hashing_trick(xt.lower(), vocab_size, hash_function='md5'))

        x_train, x_test, y_train, y_test = train_test_split(data,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=255)
        x_train = sequence.pad_sequences(x_train, maxlen=max_len)
        x_test = sequence.pad_sequences(x_test, maxlen=max_len)

        print('x_train shape:', x_train.shape)
        print('x_test shape:', x_test.shape)

        return x_train, x_test, y_train, y_test
Example No. 23
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import text_to_word_sequence
# define the document and estimate the size of the vocabulary
text = 'The quick brown fox jumped over the lazy dog.'
words = set(text_to_word_sequence(text))
vocab_size = len(words)
print(vocab_size)
# integer encode the document with one_hot (the hashing_trick variant follows below)
result = one_hot(text, round(vocab_size * 1.3))
print(result)

from keras.preprocessing.text import hashing_trick
from keras.preprocessing.text import text_to_word_sequence
# define the document
text = 'The quick brown fox jumped over the lazy dog.'
# estimate the size of the vocabulary
words = set(text_to_word_sequence(text))
vocab_size = len(words)
print(vocab_size)
# integer encode the document
result = hashing_trick(text, round(vocab_size * 1.3), hash_function='md5')
print(result)

from keras.preprocessing.text import Tokenizer
# define 5 documents
docs = ['Well done!', 'Good work', 'Great effort', 'nice work', 'Excellent!']
# create the tokenizer
t = Tokenizer()
# fit the tokenizer on the documents
t.fit_on_texts(docs)

# summarize what was learned
print(t.word_counts)
print(t.document_count)
print(t.word_index)
print(t.word_docs)
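Example No. 23 runs both one_hot and hashing_trick over the same sentence. In Keras, one_hot is implemented as hashing_trick with the built-in hash function, so the following minimal sketch (same sentence, arbitrary vocabulary size of 50) should produce matching encodings within a single run:

from keras.preprocessing.text import hashing_trick, one_hot

sentence = 'The quick brown fox jumped over the lazy dog.'
assert one_hot(sentence, 50) == hashing_trick(sentence, 50, hash_function=hash)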
Example No. 24
model = Sequential()
model.add(Dense(1000, activation='relu', input_shape=(14, )))
model.add(Dropout(0.1))
model.add(Dense(500, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(200, activation='relu'))
model.add(Dropout(0.1))

model.add(Dense(2, activation='softmax'))

sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])

model.load_weights("thought_model.h5")

inp = input('enter a sentence: ')
words = inp.split()

perm = permutations(words)

for i in perm:
    preinp = hashing_trick(str(i),
                           round(vocab_size * 1.3),
                           hash_function='md5')
    readyinp = pad_sequences(np.array([preinp]), maxlen=14)
    output = model.predict_classes(readyinp)
    #print(output)
    if output[0] == 0:
        print(i)
Example No. 25
def extract(infile, outfile, dict_keys, stem=False, lemma=False, element="narrative", arg_rebalance=""):
    # NOTE: this example is Python 2 code (print statements, string.maketrans)
    train = False
    narratives = []
    keywords = []
    
    # Get the xml from file
    root = etree.parse(infile).getroot()

    if dict_keys == None:
        train = True

        # Set up the keys for the feature vector
        dict_keys = ["MG_ID", labelname]
        if checklist in featurenames:
            dict_keys = dict_keys + ["CL_DeathAge", "CL_ageunit", "CL_DeceasedSex", "CL_Occupation", "CL_Marital", "CL_Hypertension", "CL_Heart", "CL_Stroke", "CL_Diabetes", "CL_TB", "CL_HIV", "CL_Cancer", "CL_Asthma","CL_InjuryHistory", "CL_SmokeD", "CL_AlcoholD", "CL_ApplytobaccoD"]
        elif dem in featurenames:
            dict_keys = dict_keys + ["CL_DeathAge", "CL_DeceasedSex"]
        print "dict_keys: " + str(dict_keys)
        #keywords = set([])
        #narrwords = set([])

    print "train: " + str(train)
    print "stem: " + str(stem)
    print "lemma: " + str(lemma)
    # Extract features
    matrix = []
    for child in root:
        features = {}

        if rec_type in featurenames:
            features["CL_" + rec_type] = child.tag

        # CHECKLIST features
        for key in dict_keys:
            if key[0:3] == "CL_":
                key = key[3:]
            item = child.find(key)
            value = "0"
            if item != None:
                value = item.text
            if key == "AlcoholD" or key == "ApplytobaccoD":
                if value == 'N':
                    value = 9
            features[key] = value
            #print "-- value: " + value
            #if key == "MG_ID":
            #    print "extracting features from: " + value

        # KEYWORD features
        if kw_features:
            keyword_string = get_keywords(child)
            # Remove punctuation and trailing spaces from keywords
            words = [s.strip().translate(string.maketrans("",""), string.punctuation) for s in keyword_string.split(',')]
            # Split keyword phrases into individual words
            for word in words:
                w = word.split(' ')
                words.remove(word)
                for wx in w:
                    words.append(wx.strip().strip('–'))
            keywords.append(" ".join(words))
                
        # NARRATIVE features
        if narr_features or ((not train) and (symp_train in featurenames)):
            narr_string = ""
            item = child.find(element)
            if item != None:
                if item.text != None:
                    narr_string = item.text.encode("utf-8")
                else:
                    print "warning: empty narrative"
                narr_words = [w.strip() for w in narr_string.lower().translate(string.maketrans("",""), string.punctuation).split(' ')]
                text = " ".join(narr_words)

                if stem:
                    narr_string = preprocessing.stem(text)
                elif lemma:
                    narr_string = preprocessing.lemmatize(text)
            narratives.append(narr_string.strip().lower())
            #print "Adding narr: " + narr_string.lower()

        # SYMPTOM features
        elif train and (symp_train in featurenames):
            narr_string = ""
            item = child.find("narrative_symptoms")
            if item != None:
                item_text = item.text
                if item_text != None and len(item_text) > 0:
                    narr_string = item.text.encode("utf-8")
                    #narr_words = [w.strip() for w in narr_string.lower().translate(string.maketrans("",""), string.punctuation).split(' ')]
            narratives.append(narr_string.lower())
            print "Adding symp_narr: " + narr_string.lower()

        # Save features
        matrix.append(features)

    # Construct the feature matrix

    # COUNT or TFIDF features
    if narr_count in featurenames or kw_count in featurenames or narr_tfidf in featurenames or kw_tfidf in featurenames or lda in featurenames or symp_train in featurenames:
        documents = []
        if narr_count in featurenames or narr_tfidf in featurenames or lda in featurenames or symp_train in featurenames:
            documents = narratives
            print "narratives: " + str(len(narratives))
        elif kw_count in featurenames or kw_tfidf in featurenames:
            documents = keywords
            print "keywords: " + str(len(keywords))

        # Create count matrix
        global count_vectorizer
        if train:
            print "training count_vectorizer"
            count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(min_ngram,max_ngram),stop_words=stopwords)
            count_vectorizer.fit(documents)
            dict_keys = dict_keys + count_vectorizer.get_feature_names()
        print "transforming data with count_vectorizer"
        count_matrix = count_vectorizer.transform(documents)
        matrix_keys = count_vectorizer.get_feature_names()

        print "writing count matrix to file"
        out_matrix = open(infile + ".countmatrix", "w")
        out_matrix.write(str(count_matrix))
        out_matrix.close()

        # Add count features to the dictionary
        for x in range(len(matrix)):
            feat = matrix[x]
            for i in range(len(matrix_keys)):
                key = matrix_keys[i]
                val = count_matrix[x,i]
                feat[key] = val

        # Convert counts to TFIDF
        if (narr_tfidf in featurenames) or (kw_tfidf in featurenames):
            print "converting to tfidf..."
            print "matrix_keys: " + str(len(matrix_keys))

            # Use the training count matrix for fitting
            if train:
                global tfidfTransformer
                tfidfTransformer = sklearn.feature_extraction.text.TfidfTransformer()
                tfidfTransformer.fit(count_matrix)

            # Convert matrix to tfidf
            tfidf_matrix = tfidfTransformer.transform(count_matrix)
            print "count_matrix: " + str(count_matrix.shape)
            print "tfidf_matrix: " + str(tfidf_matrix.shape)

            # Replace features in matrix with tfidf
            for x in range(len(matrix)):
                feat = matrix[x]
                #values = tfidf_matrix[x,0:]
                #print "values: " + str(values.shape[0])
                for i in range(len(matrix_keys)):
                    key = matrix_keys[i]
                    val = tfidf_matrix[x,i]
                    feat[key] = val

        # LDA topic modeling features
        if lda in featurenames:
            global ldaModel
            if train:
                ldaModel = LatentDirichletAllocation(n_topics=num_topics)
                ldaModel.fit(count_matrix)
            lda_matrix = ldaModel.transform(count_matrix)
            for t in range(0,num_topics):
                dict_keys.append("lda_topic_" + str(t))
            for x in range(len(matrix)):
                for y in range(len(lda_matrix[x])):
                    val = lda_matrix[x][y]
                    matrix[x]["lda_topic_" + str(y)] = val

            # TODO: Print LDA topics

    # WORD2VEC features
    elif narr_vec in featurenames:
        print "Warning: using word2vec features, ignoring all other features"

        # Create word2vec mapping
        word2vec, dim = load_word2vec(vecfile)

        # Convert words to vectors and add to matrix
        dict_keys.append(narr_vec)
        global max_seq_len
        max_seq_len = 200
        #if train:
            #max_seq_len = 0
        print "word2vec dim: " + str(dim)
        print "initial max_seq_len: " + str(max_seq_len)
        zero_vec = []
        for z in range(0, dim):
            zero_vec.append(0)
        for x in range(len(matrix)):
            narr = narratives[x]
            #print "narr: " + narr
            vectors = []
            vec = zero_vec
            for word in narr.split(' '):
                if len(word) > 0:
                    #if word == "didnt":
                    #    word = "didn't"
                    if word in word2vec:
                        vec = word2vec[word]
                    vectors.append(vec)
            length = len(vectors)
            if length > max_seq_len:
                #if train:
                #    max_seq_len = length
                vectors = vectors[(-1*max_seq_len):]
            (matrix[x])[narr_vec] = vectors

        # Pad the narr_vecs with 0 vectors
        print "padding vectors to reach maxlen " + str(max_seq_len)
        for x in range(len(matrix)):
            length = len(matrix[x][narr_vec])
            matrix[x]['max_seq_len'] = max_seq_len
            if length < max_seq_len:
                for k in range(0, max_seq_len-length):
                    matrix[x][narr_vec].insert(0,zero_vec) # use insert for pre-padding

    # narr_seq for RNN
    elif narr_seq in featurenames:
        global vocab_size, max_seq_len
        if train:
            dict_keys.append(narr_seq)
            dict_keys.append('vocab_size')
            dict_keys.append('max_seq_len')
            vocab = set()
            for narr in narratives:
                words = narr.split(' ')
                for word in words:
                    vocab.add(word)
            vocab_size = len(vocab)
            max_seq_len = 0

        sequences = []

        # Convert text into integer sequences
        for x in range(len(matrix)):
            narr = narratives[x]
            seq = hashing_trick(narr, vocab_size, hash_function='md5', filters='\t\n', lower=True, split=' ')
            if len(seq) > max_seq_len:
                max_seq_len = len(seq)
            sequences.append(seq)

        # Pad the sequences
        sequences = pad_sequences(sequences, maxlen=max_seq_len, dtype='int32', padding='pre')
        for x in range(len(matrix)):
            matrix[x]['narr_seq'] = sequences[x]
            matrix[x]['vocab_size'] = vocab_size
            matrix[x]['max_seq_len'] = max_seq_len

    #if arg_rebalance != "":
    #    matrix_re = rebalance_data(matrix, dict_keys, arg_rebalance)
    #    write_to_file(matrix_re, dict_keys, outfile)
    #else:
    data_util.write_to_file(matrix, dict_keys, outfile)
Example No. 26
 # 'Transportation mode': ''}
 # temp[col] is the mapped value of a specific column (e.g. a string)
 pos_body, list2_body, pos_subj, list2_subj, neg_subj, neu_subj, posi_subj, compound_subj, neg_body, neu_body, posi_body, compound_body = clean(temp["description"])
 temp["posi_subj"] = posi_subj
 temp["neu_subj"] = neu_subj
 temp["neg_subj"] = neg_subj
 temp["compound_subj"] = compound_subj
 temp["posi_body"] = posi_body
 temp["neu_body"] = neu_body
 temp["neg_body"] = neg_body
 temp["compound_body"] = compound_body
 # Hash subject POS
 pos_subj_hash = ""
 for i in pos_subj:
     pos_subj_hash+= hashlib.sha512((salt + str(i)).encode('utf-8')).hexdigest() + " "
 temp["pos_subj"] = hashing_trick(pos_subj_hash, 1000000, hash_function='md5')
 # Hash Mail subject
 txt_subj_hash = ""
 for i in list2_subj:
     txt_subj_hash += hashlib.sha512((salt + str(i)).encode('utf-8')).hexdigest() + " "
 temp["txt_subj"] = hashing_trick(txt_subj_hash, 1000000, hash_function='md5')
 # Hash main POS
 pos_main_hash = ""
 for i in pos_body:
     pos_main_hash += hashlib.sha512((salt + str(i)).encode('utf-8')).hexdigest() + " "
 temp["pos_main"] = hashing_trick(pos_main_hash, 1000000, hash_function='md5')
 # Hash main body text
 txt_main_hash = ""
 for i in list2_body:
     txt_main_hash += hashlib.sha512((salt + str(i)).encode('utf-8')).hexdigest() + " "
 temp["txt_main"] = hashing_trick(txt_main_hash, 1000000, hash_function='md5')
Example No. 27
def test_hashing_trick_md5():
    text = 'The cat sat on the mat.'
    encoded = hashing_trick(text, 5, hash_function='md5')
    assert len(encoded) == 6
    assert np.max(encoded) <= 4
    assert np.min(encoded) >= 1
Example No. 28
def test_hashing_trick_hash():
    text = 'The cat sat on the mat.'
    encoded = hashing_trick(text, 5)
    assert len(encoded) == 6
    assert np.max(encoded) <= 4
    assert np.min(encoded) >= 1
Example No. 29
def test_hashing_trick_md5():
    text = 'The cat sat on the mat.'
    encoded = hashing_trick(text, 5, hash_function='md5')
    assert len(encoded) == 6
    assert np.max(encoded) <= 4
    assert np.min(encoded) >= 1
Example No. 30
def test_hashing_trick_hash():
    text = 'The cat sat on the mat.'
    encoded = hashing_trick(text, 5)
    assert len(encoded) == 6
    assert np.max(encoded) <= 4
    assert np.min(encoded) >= 1
Example No. 31
 def test_hashing_trick_md5(self):
     sample_text = "The cat sat on the mat."
     encoded = text.hashing_trick(sample_text, 5, hash_function="md5")
     self.assertLen(encoded, 6)
     self.assertLessEqual(np.max(encoded), 4)
     self.assertGreaterEqual(np.min(encoded), 1)
Example No. 32
 def tokenize(self, text: str):
     return hashing_trick(text, self.max_words, hash_function='md5')
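Taken together, most of the examples above follow the same two-step pattern: hash each text into integer indices with hashing_trick, then pad the sequences to a fixed length before feeding a model. A minimal end-to-end sketch of that pattern (the vocabulary size and maxlen are hypothetical; the sentences are reused from the test examples above):

from keras.preprocessing.text import hashing_trick
from keras.preprocessing.sequence import pad_sequences

texts = ['The cat sat on the mat.', 'The quick brown fox jumped over the lazy dog.']
vocab_size = 200          # hashed indices fall in the range 1..vocab_size-1
encoded = [hashing_trick(t, vocab_size, hash_function='md5') for t in texts]
padded = pad_sequences(encoded, maxlen=10, padding='post')  # shape (2, 10), zero-padded at the end
print(padded)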