Example n. 1
0
def read_data(filename, filename_v):
    """Read labelled text samples together with their gene/variation metadata.

    Parameters
    ----------
    filename : str
        Text file: one header row, then lines of the form ``<id>||<text>``.
    filename_v : str
        CSV variants file: one header row, then
        ``<id>,<gene>,<variation>,<class>`` rows (classes are 1-based).

    Returns
    -------
    tuple
        ``(texts, texts_gen, labels, labels_index)`` where ``texts`` is a list
        of space-joined word chunks, ``texts_gen`` the matching
        ``"<gene> <variation>"`` string per chunk, ``labels`` the matching
        0-based class ids, and ``labels_index`` maps class name (string) to
        its 0-based id.

    Notes
    -----
    Texts longer than ``MAX_NB_WORDS_IN_TEXT`` words are split into
    fixed-size chunks; a trailing fragment is kept only if it is longer than
    50 words. At most ``NUM_ROWS_FROM_TEXT`` rows of *filename* are read.
    """
    texts = []          # list of text samples (word chunks re-joined with spaces)
    labels_index = {}   # class name (string) -> 0-based label id
    labels = []         # 0-based label id per entry in ``texts``
    texts_gen = []      # "<gene> <variation>" string per entry in ``texts``

    def _clean(s):
        # Keep only lowercase letters and digits. The original pattern
        # '[^a-z^0-9]' contained a literal '^' inside the class (so literal
        # '^' characters were kept); '^' does not occur in gene/variation
        # names, so dropping it is equivalent in practice and matches intent.
        return re.sub('[^a-z0-9]', '', s.lower())

    def _add(words, label, text_gen):
        # Record one (chunk, label, gene/variation) sample triple.
        texts.append(' '.join(words))
        labels.append(label)
        texts_gen.append(text_gen)

    # id -> [gene, variation, class]; the variants file is small enough
    # to load fully up front.
    diction = {}
    with open(filename_v, 'r', encoding="utf-8") as fv:
        for row_no, raw in enumerate(fv):
            if row_no == 0:  # skip the CSV header row
                continue
            data = tf.compat.as_str(raw.strip()).split(',')
            diction[int(data[0])] = [data[1], data[2], int(data[3])]

    with open(filename, 'r', encoding="utf-8") as f:
        for row_no, line in enumerate(f):
            if row_no >= NUM_ROWS_FROM_TEXT:
                break
            # Skip the header row and degenerate/too-short lines.
            if row_no == 0 or len(line) <= 100:
                continue
            sep = line.find('||')
            sample_id = int(line[:sep])  # renamed: don't shadow builtin ``id``
            t_w = text_to_word_sequence(line[sep + 2:])
            gene, variation, klass = diction[sample_id]
            text_gen = _clean(gene) + ' ' + _clean(variation)
            label = klass - 1  # file classes are 1-based; labels are 0-based
            if len(t_w) > MAX_NB_WORDS_IN_TEXT:
                for start in range(0, len(t_w), MAX_NB_WORDS_IN_TEXT):
                    chunk = t_w[start:start + MAX_NB_WORDS_IN_TEXT]
                    # BUGFIX: the original sliced MAX_NB_WORDS_IN_TEXT-1 words
                    # per chunk while stepping by MAX_NB_WORDS_IN_TEXT, so one
                    # word was silently dropped from every full chunk.
                    # Keep full chunks, and trailing fragments over 50 words.
                    if len(chunk) == MAX_NB_WORDS_IN_TEXT or len(chunk) > 50:
                        _add(chunk, label, text_gen)
            else:
                _add(t_w, label, text_gen)
            labels_index[str(klass)] = label
    return texts, texts_gen, labels, labels_index
Example n. 2
0




# Prediction/submission loop (truncated in this view: the ``with``/``for``
# bodies continue past the last visible line).
# NOTE(review): relies on names defined elsewhere in the file: ``filename``,
# ``SAVE_DIR``, ``i``, ``text_to_word_sequence``, ``MAX_NB_WORDS_IN_TEXT``,
# ``predict``, ``word_index``, ``model``, ``model_shape1``.
with open(filename, 'r', encoding="utf-8") as f:
    with open(os.path.join(SAVE_DIR, 'submissionFile'), 'a') as sf:
        with open(os.path.join(SAVE_DIR, 'submissionFile_average'), 'a') as sfa:
            # CSV headers: one probability column per class (9 classes).
            # NOTE(review): files are opened in append mode, so re-running
            # appends a second header — presumably intentional/one-shot; verify.
            sf.write('ID,class1,class2,class3,class4,class5,class6,class7,class8,class9\n')
            sfa.write('ID,class1,class2,class3,class4,class5,class6,class7,class8,class9\n')
            for line in f:
                if i>0:  # skip the header row of the input file
                    # Each data line has the form ``<id>||<text>``.
                    text= line[line.find('||') + 2:]
                    id =int(line[:line.find('||')])  # NOTE(review): shadows builtin ``id``

                    t_w = text_to_word_sequence(text)
                    outputstr_m = str(id)  # accumulator for the per-chunk (max?) output row
                    outputstr_a = str(id)  # accumulator for the averaged output row
                    predict_list = []      # one prediction per text chunk
                    # ----------
                    # Long texts are predicted chunk by chunk.
                    if len(t_w) > MAX_NB_WORDS_IN_TEXT:
                        for text_i in range(0, len(t_w), MAX_NB_WORDS_IN_TEXT):
                            if text_i + MAX_NB_WORDS_IN_TEXT - 1 < len(t_w):
                                # NOTE(review): slices MAX_NB_WORDS_IN_TEXT-1 words
                                # while stepping by MAX_NB_WORDS_IN_TEXT — one word
                                # per full chunk is dropped; looks like an off-by-one.
                                t_w_splited=t_w[text_i:text_i + MAX_NB_WORDS_IN_TEXT - 1]
                                predict_list.append(predict(t_w_splited, id, word_index, model, sf,model_shape1))
                            elif len(t_w) - text_i > 50:
                                # Trailing fragment: predict it only if longer than 50 words.
                                t_w_splited=t_w[text_i:len(t_w)]
                                predict_list.append(predict(t_w_splited, id, word_index, model, sf,model_shape1))
                    else:
                        # Short text: a single prediction over all words.
                        predict_list.append(predict(t_w, id, word_index, model, sf,model_shape1))