Example #1
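All of the examples below assume roughly the following imports; aidrtokenize, stop_words, and give_emoji_free_text are project-specific helpers that are assumed to be available, and an older gensim (3.x) and standalone Keras API are assumed:

import random
from collections import Counter

import numpy as np
from sklearn import preprocessing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.similarities import WmdSimilarity

import aidrtokenize  # project-specific tweet tokenizer (assumed available)
# stop_words (a collection of stop words) and give_emoji_free_text are assumed
# to be defined elsewhere in the project.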
def convert_to_vector(inst_list, tokenizer, MAX_SEQUENCE_LENGTH, delim):
    """
    Prepare the data
    """
    #ids = []
    data = []
    lab = []
    for inst in inst_list:
        txt = aidrtokenize.tokenize(inst.text)
        text = " ".join(txt)
        if (len(txt) < 1):
            print("TEXT SIZE: " + str(txt))
            continue
        data.append(text)
        lab.append(inst.label)

    le = preprocessing.LabelEncoder()
    yL = le.fit_transform(lab)
    labels = list(le.classes_)

    label = yL.tolist()
    yC = len(set(label))
    yR = len(label)
    y = np.zeros((yR, yC))
    y[np.arange(yR), yL] = 1
    y = np.array(y, dtype=np.int32)

    sequences = tokenizer.texts_to_sequences(data)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Shape of data tensor:', data.shape)
    return data, y, le, labels
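A minimal usage sketch for convert_to_vector, assuming insts is a list of objects with .text and .label attributes and tok is a Keras Tokenizer already fitted on the training texts (note that the delim argument is never used inside this function):

# x: padded integer sequences, y: one-hot labels, le: fitted LabelEncoder, class_names: label strings
x, y, le, class_names = convert_to_vector(insts, tok, 25, "\t")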
def read_train_data(dataFile, MAX_NB_WORDS, MAX_SEQUENCE_LENGTH, delim):
    """
    Prepare the data
    """
    data = []
    labels = []
    with open(dataFile, 'rb') as f:
        next(f)
        for line in f:
            line = line.decode(encoding='utf-8', errors='strict')
            line = line.strip()
            if (line == ""):
                continue
            row = line.split(delim)
            txt = row[3].strip().lower()
            txt = aidrtokenize.tokenize(txt)
            label = row[6]
            if (len(txt) < 1):
                print(txt)
                continue
            data.append(txt)
            labels.append(label)

    data_shuf = []
    lab_shuf = []
    index_shuf = list(range(len(data)))
    random.shuffle(index_shuf)
    for i in index_shuf:
        data_shuf.append(data[i])
        lab_shuf.append(labels[i])

    le = preprocessing.LabelEncoder()
    yL = le.fit_transform(lab_shuf)
    labels = list(le.classes_)

    label = yL.tolist()
    yC = len(set(label))
    yR = len(label)
    y = np.zeros((yR, yC))
    y[np.arange(yR), yL] = 1
    y = np.array(y, dtype=np.int32)

    # finally, vectorize the text samples into a 2D integer tensor
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, oov_token="OOV_TOK")
    tokenizer.fit_on_texts(data_shuf)
    sequences = tokenizer.texts_to_sequences(data_shuf)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    print('Shape of data tensor:', data.shape)
    return data, y, le, labels, word_index, tokenizer
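The block above that builds y is a manual one-hot encoding of the integer labels (equivalent, for example, to keras.utils.to_categorical). A hedged call sketch for read_train_data; the file name and sizes are placeholders, and the input is expected to be a delimited file with the tweet text in column 3 and the label in column 6:

x_train, y_train, le, class_names, word_index, tok = read_train_data(
    "train.tsv", 20000, 25, "\t")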
def read_dev_data(dataFile, tokenizer, MAX_SEQUENCE_LENGTH, delim, train_le):
    """
    Prepare the data
    """
    id_list = []
    data = []
    labels = []
    with open(dataFile, 'rb') as f:
        next(f)
        for line in f:
            line = line.decode(encoding='utf-8', errors='strict')
            line = line.strip()
            if (line == ""):
                continue
            row = line.split(delim)
            t_id = row[2].strip().lower()
            txt = row[3].strip().lower()
            txt = aidrtokenize.tokenize(txt)
            # txt = remove_stop_words(txt, stop_words)
            label = row[6]
            if (len(txt) < 1):
                print(txt)
                continue
            # if(isinstance(txt, str)):
            data.append(txt)
            labels.append(label)
            id_list.append(t_id)

    print(len(data))
    # Note: the shuffled copies below are not used further down; the function
    # vectorizes the unshuffled data and labels so they stay aligned with id_list.
    data_shuf = []
    lab_shuf = []
    index_shuf = list(range(len(data)))
    random.shuffle(index_shuf)
    for i in index_shuf:
        data_shuf.append(data[i])
        lab_shuf.append(labels[i])

    le = train_le  # preprocessing.LabelEncoder()
    yL = le.transform(labels)
    labels = list(le.classes_)

    label = yL.tolist()
    yC = len(le.classes_)  # full training class set, so the one-hot width matches training
    yR = len(label)
    y = np.zeros((yR, yC))
    y[np.arange(yR), yL] = 1
    y = np.array(y, dtype=np.int32)

    sequences = tokenizer.texts_to_sequences(data)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Shape of data tensor:', data.shape)
    return data, y, le, labels, word_index, id_list
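read_dev_data reuses the tokenizer and LabelEncoder produced by read_train_data, so le.transform raises an error if the dev file contains a label that was never seen during training. A hedged call sketch, continuing the placeholder names from the training sketch above:

x_dev, y_dev, _, _, _, dev_ids = read_dev_data("dev.tsv", tok, 25, "\t", le)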
def graph_dist(tweetlist, model, outFile):
    of = open(outFile, "w")
    model.init_sims(replace=True)  # normalizes the vectors in the word2vec model
    #rowVector=[]
    index = 0
    for tweetR in tweetlist:
        rVec = aidrtokenize.tokenize(tweetR)
        rVec = [w for w in rVec if w not in stop_words]
        colVector = []
        for tweetC in tweetlist:
            cVec = aidrtokenize.tokenize(tweetC)
            cVec = [w for w in cVec if w not in stop_words]
            distance = model.wmdistance(rVec, cVec)
            colVector.append(distance)
        vector = str(index) + " "
        for val in colVector:
            vector = vector + str(1 - val) + " "
        of.write(vector + "\n")
        #rowVector.append(colVector)
    of.close()
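graph_dist computes the Word Mover's Distance between every ordered pair of tweets (a quadratic number of wmdistance calls) and writes 1 - distance per column; since WMD is not bounded by 1, the written values can be negative. A hedged call sketch, assuming a gensim 3.x word-embedding model; the embedding path and file names are illustrative:

from gensim.models import KeyedVectors

w2v = KeyedVectors.load_word2vec_format("crisis_word2vec.bin", binary=True)  # hypothetical path
graph_dist(tweets, w2v, "wmd_dist_graph.txt")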
def graph_sim(tweetlist, model, outFile):
    print("Number of tweets to generate the graph: " + str(len(tweetlist)))
    of = open(outFile, "w")
    model.init_sims(replace=True)  # normalizes the vectors in the word2vec model
    rowVector = []

    for tweetR in tweetlist:
        rVec = aidrtokenize.tokenize(tweetR)
        rVec = [w for w in rVec if w not in stop_words]
        rowVector.append(rVec)
    instance = WmdSimilarity(rowVector, model, num_best=None)
    #index=0;
    print("Writing into the file....")
    for index, colVector in enumerate(instance):
        vector = ""
        for i, val in enumerate(colVector):
            if (index != i and val >= 0.3):
                #print str(index)+" != "+str(i)
                vector = vector + str(i) + " "
        of.write(str(index) + " " + vector.strip() + "\n")
    of.close()
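graph_sim builds a gensim WmdSimilarity index instead of making pairwise calls, and writes one line per tweet: the tweet index followed by the indices of all other tweets whose WMD similarity is at least 0.3. A small sketch of reading that output back as an edge list (the file name is a placeholder for whatever outFile was passed in):

edges = []
with open("wmd_sim_graph.txt") as f:
    for line in f:
        parts = line.split()
        src = int(parts[0])
        edges.extend((src, int(dst)) for dst in parts[1:])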
Example #6
def text_proprecess(text):
    txt = text.strip().lower()
    txt = give_emoji_free_text(txt)
    txt = aidrtokenize.tokenize(txt)
    return txt
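A hedged usage sketch for text_proprecess; it returns the token list produced by aidrtokenize.tokenize, not a joined string:

tokens = text_proprecess("  Flood waters RISING near the bridge!!  ")
# tokens is a list of lowercased tokens with emoji removed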
Example #7
def read_train_data_multimodal(data_file, MAX_NB_WORDS, MAX_SEQUENCE_LENGTH,
                               label_index, delim):
    """
    Prepare the data
    """
    data = []
    image_list = []
    lab = []
    with open(data_file, 'rb') as f:
        next(f)
        for line in f:
            line = line.decode(encoding='utf-8', errors='strict')
            line = line.strip()
            if (line == ""):
                continue
            row = line.split(delim)
            txt = row[3].strip()
            image_path = str(row[4].strip())
            label = str(row[int(label_index)])
            txt = aidrtokenize.tokenize(txt)
            text = " ".join(txt)
            if (len(txt) < 1):
                print("TEXT SIZE:" + str(txt))
                continue
            data.append(text)
            lab.append(label)
            image_list.append(image_path)
    counts = Counter(lab)
    print(counts)
    print(len(data))
    data_shuf = []
    lab_shuf = []
    image_list_shuf = []
    index_shuf = list(range(len(data)))
    random.shuffle(index_shuf)
    for i in index_shuf:
        data_shuf.append(data[i])
        lab_shuf.append(lab[i])
        image_list_shuf.append(image_list[i])

    #print(data[0])
    le = preprocessing.LabelEncoder()
    yL = le.fit_transform(lab_shuf)
    labels = list(le.classes_)
    print("training classes: " + " ".join(labels))
    label = yL.tolist()
    yC = len(set(label))
    yR = len(label)
    y = np.zeros((yR, yC))
    y[np.arange(yR), yL] = 1
    y = np.array(y, dtype=np.int32)

    # finally, vectorize the text samples into a 2D integer tensor
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, oov_token="OOV_TOK")
    tokenizer.fit_on_texts(data_shuf)
    sequences = tokenizer.texts_to_sequences(data_shuf)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

    # labels = to_categorical(np.asarray(labels))
    print('Shape of data tensor:', data.shape)
    # print('Shape of label tensor:', labels.shape)
    # return data,labels,word_index,dim;
    return data, image_list_shuf, y, le, labels, word_index, tokenizer
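A hedged call sketch for read_train_data_multimodal; the file name, sizes, and label column are placeholders, and label_index selects which delimited column holds the label (column 3 is the text, column 4 the image path):

x_train, img_train, y_train, le, class_names, word_index, tok = \
    read_train_data_multimodal("task_train.tsv", 20000, 25, 6, "\t")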
Example #8
def read_dev_data_multimodal(data_file, tokenizer, MAX_SEQUENCE_LENGTH,
                             label_index, delim):
    """
    Prepare the data
    """
    ids = []
    data = []
    image_list = []
    lab = []
    with open(data_file, 'rb') as f:
        next(f)
        for line in f:
            line = line.decode(encoding='utf-8', errors='strict')
            line = line.strip()
            if (line == ""):
                continue
            row = line.split(delim)
            image_id = row[2].strip()
            txt = row[3].strip()
            image_path = str(row[4].strip())
            label = str(row[int(label_index)])
            if (len(txt) < 1):
                print("TEXT SIZE:" + txt)
                continue
            txt = aidrtokenize.tokenize(txt)
            text = " ".join(txt)
            if (len(txt) < 1):
                print("TEXT SIZE:" + txt)
                continue
            data.append(text)
            lab.append(label)
            image_list.append(image_path)
            ids.append(image_id)
    counts = Counter(lab)
    print(counts)
    print(len(data))
    data_shuf = []
    lab_shuf = []
    image_list_shuf = []
    ids_shuf = []
    index_shuf = list(range(len(data)))
    random.shuffle(index_shuf)
    for i in index_shuf:
        data_shuf.append(data[i])
        lab_shuf.append(lab[i])
        image_list_shuf.append(image_list[i])
        ids_shuf.append(ids[i])

    le = preprocessing.LabelEncoder()
    yL = le.fit_transform(lab_shuf)
    labels = list(le.classes_)
    print("training classes: " + " ".join(labels))

    label = yL.tolist()
    yC = len(set(label))
    yR = len(label)
    y = np.zeros((yR, yC))
    y[np.arange(yR), yL] = 1
    y = np.array(y, dtype=np.int32)

    sequences = tokenizer.texts_to_sequences(data_shuf)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    print('Shape of data tensor:', data.shape)
    return data, image_list_shuf, y, le, labels, ids_shuf
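Unlike read_dev_data above, this multimodal dev reader fits a new LabelEncoder on the dev labels instead of reusing the training encoder, so the class-to-index mapping only matches training if both splits contain exactly the same label set. A hedged call sketch, continuing the placeholder names from the multimodal training sketch:

x_dev, img_dev, y_dev, dev_le, dev_classes, dev_ids = \
    read_dev_data_multimodal("task_dev.tsv", tok, 25, 6, "\t")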