Example no. 1
def parse(Masterdir, filename, seperator, datacol, labelcol, labels):
    """
    Purpose -> Data I/O
    Input   -> Data file containing sentences and labels along with the global variables
    Output  -> Tokenized sentences in a list along with the labels as a numpy array
    """

    # Reads the file and splits the data into individual lines
    f = open(Masterdir + filename, 'r', encoding='UTF-8')
    lines = f.read().lower().split('\n')[:-1]
    f.close()
    print(lines)

    X_train = []
    Y_train = []

    # Processes individual lines
    for line in lines:
        # Separator for the current dataset. Currently '\t'.
        line = line.split(seperator)
        # Token is the function which implements basic preprocessing as mentioned in our paper
        tokenized_lines = token(line[datacol])
        X_train.append(tokenized_lines)
        # Appends labels
        if line[labelcol] == labels[0]:
            Y_train.append(0)
        if line[labelcol] == labels[1]:
            Y_train.append(1)
        if line[labelcol] == labels[2]:
            Y_train.append(2)

    # Converts Y_train to a numpy array
    Y_train = np.asarray(Y_train)

    assert (len(X_train) == Y_train.shape[0])

    return [X_train, Y_train]
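The function above assumes that numpy has been imported as np and that a token() preprocessing helper is defined elsewhere in the repository. A hypothetical call, with placeholder paths and label strings rather than the project's real configuration, could look like this:

import numpy as np

# Placeholder configuration -- adjust to the actual dataset layout.
Masterdir = './data/'
filename = 'train.txt'           # tab-separated: <sentence>\t<label>
seperator = '\t'
datacol, labelcol = 0, 1
labels = ['negative', 'neutral', 'positive']   # assumed label strings

X_train, Y_train = parse(Masterdir, filename, seperator, datacol, labelcol, labels)
print(len(X_train), Y_train.shape)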
Example no. 2
def parse1(Masterdir1, filename1, seperator1, datacol1, labelcol1, labels1):
    """
    #datacol表示句子
    Purpose -> Data I/O
    Input   -> Data file containing sentences and labels along with the global variables
    Output  -> Sentences cleaned up in list of lists format along with the labels as a numpy array
    """
    # Reads the file and splits the data into individual lines
    # NOTE: the path is hard-coded and ignores the Masterdir1/filename1 arguments
    #f = open(Masterdir1 + Datadir1 + filename1, 'r', encoding='utf-8')
    f = open('E:/复现/BAKSA_IITK-master/HASOC_Off/Data/task1/翻译test+lable.txt', 'r', encoding='utf-8')
    lines = f.read().lower().split('\n')[:-1]
    f.close()
    #print(lines)

    X_test = []
    Y_test = []


    # Processes individual lines
    for line in lines:
        # Separator for the current dataset. Currently '\t'.
        line = line.split(seperator1)
        # Token is the function which implements basic preprocessing as mentioned in our paper
        # Leftover debug code: `re` was built from the first column, but the file
        # write below is commented out, so it is never used (the name would also
        # shadow the `re` module if that were imported).
        # for l in line:
        #     re = line[0] + '\n'
        # fd = open('E:/复现/BAKSA_IITK-master/Baseline/Sub-word-LSTM/pred/digt.csv', 'a')
        # fd.write(re)  # 145 test rows
        tokenized_lines = token(line[datacol1])
        #print('xxxxxx', line[datacol1])  # 105 rows
        #print(tokenized_lines)  # 105 rows

        # Creates character lists
        char_list = []
        for words in tokenized_lines:
            for char in words:
                char_list.append(char)
            char_list.append(' ')
        #print(char_list) #- Debugs the character list created
        X_test.append(char_list)

        #Appends labels
        if line[labelcol1] == labels1[0]:
            Y_test.append(0)
        if line[labelcol1] == labels1[1]:
            Y_test.append(1)


        # Leftover debug code that assigned a dummy label to every line:
        # for line in lines:
        #     i = 0
        #     Y_test.append(i)
        #print(Y_test)

    # Converts Y_test to a numpy array
    Y_test = np.asarray(Y_test)

    assert (len(X_test) == Y_test.shape[0])

    return [X_test, Y_test]
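Every example in this listing calls a token() function that is defined elsewhere in the repository and described only as "basic preprocessing as mentioned in our paper". Its real implementation is not shown here; a minimal stand-in, assuming it merely lower-cases, strips URLs/mentions and punctuation, and splits on whitespace, might look like:

import re

def token(sentence):
    # Hypothetical stand-in for the repository's preprocessing function.
    sentence = sentence.lower()
    sentence = re.sub(r'https?://\S+|@\w+', ' ', sentence)  # drop URLs and @mentions
    sentence = re.sub(r'[^\w\s]', ' ', sentence)             # drop punctuation
    return sentence.split()                                   # list of word tokens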
Example no. 3
def parse(Masterdir, filename, seperator, datacol, labelcol, labels):
    """
    Purpose -> Data I/O
    Input   -> Data file containing sentences and labels along with the global variables
    Output  -> Sentences cleaned up in list of lists format along with the labels as a numpy array
    """
    # Reads the file and splits the data into individual lines
    # NOTE: the path is hard-coded and ignores the Masterdir/filename arguments
    #f = open(Masterdir + Datadir + filename, 'r', encoding='utf-8')
    f = open(
        'E:/复现/BAKSA_IITK-master/HASOC_Off/Data/task1/翻译ml-Hasoc-offensive-train.txt',
        'r',
        encoding='utf-8')
    lines = f.read().lower().split('\n')[:-1]
    f.close()
    # print(lines)

    X_train = []
    Y_train = []
    D = []  # Note: never used below

    # Processes individual lines
    for line in lines:
        # Separator for the current dataset. Currently '\t'.
        line = line.split(seperator)
        # print(line)

        # for l in line:
        #     re = line[0] + '\n'
        # # print(re)
        # fd = open('/home/lab1510/Desktop/Sub-word-LSTM-master/Data/digt.csv', 'w')
        # fd.write(re)

        # Token is the function which implements basic preprocessing as mentioned in our paper
        tokenized_lines = token(line[datacol])

        # Creates character lists
        char_list = []
        for words in tokenized_lines:
            for char in words:
                char_list.append(char)
            char_list.append(' ')
        # print(char_list) #- Debugs the character list created
        X_train.append(char_list)
        # print(X_train)

        # Appends labels
        if line[labelcol] == labels[0]:
            Y_train.append(0)
        if line[labelcol] == labels[1]:
            Y_train.append(1)
    # Converts Y_train to a numpy array
    Y_train = np.asarray(Y_train)
    assert (len(X_train) == Y_train.shape[0])

    return [X_train, Y_train]
Example no. 4
def parse(Masterdir, filename, seperator, datacol, labelcol, labels):
    """
    Purpose -> Data I/O
    Input   -> Data file containing sentences and labels along with the global variables
    Output  -> Sentences cleaned up in list of lists format along with the labels as a numpy array
    """
    # Reads the file and splits the data into individual lines
    f = open(Masterdir + filename, 'r', encoding='UTF-8')
    lines = f.read().lower().split('\n')[:-1]
    f.close()
    print(lines)

    X_train = []
    Y_train = []

    # Processes individual lines
    for line in lines:
        # Separator for the current dataset. Currently '\t'.
        line = line.split(seperator)
        # Token is the function which implements basic preprocessing as mentioned in our paper
        tokenized_lines = token(line[datacol])
        # print(tokenized_lines)

        # Creates character lists
        # char_list = []
        # sentence = []
        # for words in tokenized_lines:
        #     for char in words:
        #         char_list.append(char)
        #     sentence.append(char_list)
        # print(sentence)
        # # print(char_list) - Debugs the character list created
        X_train.append(tokenized_lines)
        # print(X_train)

        # Appends labels
        if line[labelcol] == labels[0]:
            Y_train.append(0)
        if line[labelcol] == labels[1]:
            Y_train.append(1)
        if line[labelcol] == labels[2]:
            Y_train.append(2)

    # Converts Y_train to a numpy array
    Y_train = np.asarray(Y_train)
    # print(Y_train)

    assert (len(X_train) == Y_train.shape[0])

    return [X_train, Y_train]
Example no. 5
def parse(Masterdir, filename, seperator, datacol, labelcol, labels):
    """
	Purpose -> Data I/O
	Input   -> Data file containing sentences and labels along with the global variables
	Output  -> Sentences cleaned up in list of lists format along with the labels as a numpy array
	"""
    # Reads the file and splits the data into individual lines
    # NOTE: the path is hard-coded and ignores the Masterdir/filename arguments
    f = open(
        'E:/复现/BAKSA_IITK-master/HASOC_Off/Data/task2/Malayalam_offensive_data_Training-YT.txt',
        'r',
        encoding='utf-8')
    lines = f.read().lower().split('\n')[:-1]
    f.close()
    # Drops the header row
    lines = lines[1:]

    X_train = []
    Y_train = []

    #Processes individual lines
    for line in lines:
        # Separator for the current dataset. Currently '\t'.
        line = line.split(seperator)
        #Token is the function which implements basic preprocessing as mentioned in our paper
        tokenized_lines = token(line[datacol])
        # print(tokenized_lines)
        #Creates character lists
        char_list = []
        for words in tokenized_lines:
            for char in words:
                char_list.append(char)
            char_list.append(' ')
        #print(char_list) - Debugs the character list created
        X_train.append(char_list)

        #Appends labels
        if line[labelcol] == labels[0]:
            Y_train.append(0)
        if line[labelcol] == labels[1]:
            Y_train.append(1)

    #Converts Y_train to a numpy array
    Y_train = np.asarray(Y_train)
    assert (len(X_train) == Y_train.shape[0])

    return [X_train, Y_train]
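Downstream, the character lists produced here are normally converted to integer sequences before being fed to the LSTM; Example no. 7 relies on a mapping_char2num dictionary and Keras' sequence.pad_sequences for exactly that. A rough, self-contained sketch of that conversion, assuming the mapping is built directly from the training characters, follows (the data is a placeholder):

import numpy as np
from keras.preprocessing import sequence

# Placeholder data standing in for the output of parse(): each entry is a
# character list like the ones built above.
X_train = [list('good movie '), list('bad plot ')]
Y_train = np.asarray([1, 0])

# Assumed helper step: build a character-to-integer mapping from the training data.
# Index 0 is left unused because pad_sequences pads with zeros by default.
mapping_char2num = {}
for char_list in X_train:
    for ch in char_list:
        if ch not in mapping_char2num:
            mapping_char2num[ch] = len(mapping_char2num) + 1

X_encoded = [[mapping_char2num[ch] for ch in char_list] for char_list in X_train]
X_encoded = sequence.pad_sequences(X_encoded, maxlen=200)  # maxlen matches Example no. 7
print(X_encoded.shape, Y_train.shape)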
Example no. 6
def parsetest(Masterdir, filename, seperator, datacol, idlcol):
    """
    Purpose -> Data I/O for the unlabelled test set
    Input   -> Data file containing sentences and their ids along with the global variables
    Output  -> Tokenized sentences in a list along with the corresponding ids
    """
    # Reads the file and splits the data into individual lines
    f = open(Masterdir + filename, 'r', encoding='UTF-8')
    lines = f.read().lower().split('\n')[:-1]
    f.close()
    print(lines)

    X_test = []
    id_test = []

    # Processes individual lines
    for line in lines:
        # Separator for the current dataset. Currently '\t'.
        line = line.split(seperator)
        tokenized_lines = token(line[datacol])
        X_test.append(tokenized_lines)
        id_test.append(line[idlcol])

    return [X_test, id_test]
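Because parsetest returns the tokenized sentences together with their ids, a prediction file can be produced by zipping the two lists back together. A hypothetical sketch (the filename, column indices, and y_pred are placeholders; y_pred is assumed to hold one class index per sentence from a trained model):

X_test, id_test = parsetest(Masterdir, 'test.txt', '\t', datacol=0, idlcol=1)

with open('predictions.csv', 'w', encoding='utf-8') as out:
    for sample_id, pred in zip(id_test, y_pred):
        out.write('{},{}\n'.format(sample_id, pred))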
Example no. 7
    'r+')
json_string = f.read()
f.close()
model = model_from_json(json_string)

model.load_weights(Masterdir + Modeldir + 'LSTM_' + experiment_details +
                   '_weights.h5')
model.compile(loss='categorical_crossentropy',
              optimizer='adamax',
              metrics=['accuracy'])

while (1):
    # raw_input is Python 2; under Python 3 this would be input()
    inp_sent = raw_input('Enter a sentence. Press \'Q\' to exit.\n')
    if inp_sent == "Q":
        break
    inp_sent = token(inp_sent)
    X_test = []
    temp = []
    for words in inp_sent:
        for char in words:
            temp.append(mapping_char2num[char])
        temp.append(mapping_char2num[' '])
    X_test.append(temp)
    X_test = np.asarray(X_test)
    print(X_test.shape)

    X_test = sequence.pad_sequences(X_test[:], maxlen=200)
    print(X_test.shape)
    #score, acc = model.evaluate(X_test, y_test2, batch_size=batch_size)

    y_pred = model.predict_classes(X_test, batch_size=batch_size)
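    # Hypothetical post-processing (not part of the original snippet): map each
    # predicted class index back to a readable label before printing. The label
    # names below are assumptions for illustration, not the dataset's real classes.
    label_names = ['not offensive', 'offensive']
    for idx in y_pred:
        print('Prediction:', label_names[int(idx)])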
Example no. 8
def parse(Masterdir, filename, seperator, datacol, labelcol, labels):
    """
    Purpose -> Data I/O
    Input   -> Data file containing sentences and labels along with the global variables
    Output  -> Sentences cleaned up in list of lists format along with the labels as a numpy array
    """

    # Reads the file and splits the data into individual lines
    # NOTE: no encoding is specified here; the commented-out attempts below
    # experimented with UTF-8 / Tamil-specific decoding.
    f = open(Masterdir + Datadir + filename, 'r')
    lines = f.read().lower().split('\n')[:-1]
    f.close()

    # f=codecs.open(Masterdir+Datadir+filename, 'r', 'utf-8')
    # lines=f.read()
    #lines = f.readlines()
    # lines = f.read()
    # lines = to_unicode_repr(lines)
    # print("lines", lines)

    # lines = utf8.get_letters(lines)
    # lines = lines.lower().split('\n')[:-1]
    # words = utf8.get_words(lines)
    #print("words", words)

    # output = tamil.tscii.convert_to_unicode(f.read())
    # print("Output",output)

    # lines = split_content_to_sentences(datacol)
    # print("datacol", words[100])
    # istamil = utf8.is_normalized(words[100])
    # print("istamil", istamil)
    # letters = lines[50]
    # print("letters:", letters[100])
    # u = u'letters[100]'
    # print("Unicode: ",letters[100].encode('utf-8'))
    # print("Unicodetry:",tamil.tscii.convert_to_unicode(letters[100]))
    # # print("uni:", unicode(letters[100], 'utf-8'))
    # # print("Decimal: ",decimal(letters[100].encode('utf-8')))
    # res = istamil_prefix(lines[50])
    # print("res: ", res)
    # reseng = has_english(lines)
    # print("reseng: ", reseng)

    X_train = []
    Y_train = []

    #Processes individual lines
    for line in lines:
        # Separator for the current dataset. Currently '\t'.
        line = line.split(seperator)
        #Token is the function which implements basic preprocessing as mentioned in our paper
        # print("datacol: ",line[datacol])
        tokenized_lines = token(line[datacol])

        # tokenized_lines = split_content_to_sentences(line[datacol])

        #Creates character lists
        char_list = []
        for words in tokenized_lines:
            for char in words:
                char_list.append(char)
            char_list.append(' ')
        #print(char_list) - Debugs the character list created
        X_train.append(char_list)

        #Appends labels
        if line[labelcol] == labels[0]:
            Y_train.append(0)
        if line[labelcol] == labels[1]:
            Y_train.append(1)
        if line[labelcol] == labels[2]:
            Y_train.append(2)

    #Converts Y_train to a numpy array
    Y_train = np.asarray(Y_train)
    assert (len(X_train) == Y_train.shape[0])

    return [X_train, Y_train]