Esempio n. 1
0
def _load_data(file_path, item2idx, idx_cnt, pad_idx, class_num):
    """Load a tab-separated click log and group it into per-session samples.

    The file is read with pandas and ordered by ``sessionId`` then ``Time``.
    Every run of consecutive rows sharing a ``sessionId`` becomes one
    ``Sample`` whose item ids are translated to indices through ``item2idx``.

    Args:
        file_path: Path of the tab-separated file; the ``sessionId``, ``Time``
            and ``itemId`` columns are read.
        item2idx: Mapping from raw item id to index. Updated in place with
            every item that was not present yet.
        idx_cnt: Next free index to hand out to an unseen item.
        pad_idx: Reserved index that is never assigned to an item.
        class_num: Not used by this loader.

    Returns:
        A ``(samplepack, idx_cnt)`` tuple: the ``Samplepack`` holding one
        sample per session (none when the file has no rows) and the next
        free index after all new items were registered.
    """
    data = pd.read_csv(file_path, sep='\t', dtype={'itemId': np.int64})
    print("read finish")
    # Sort ascending by session id, then by time, so that the rows of one
    # session are contiguous and in click order.
    data.sort_values(['sessionId', 'Time'], inplace=True)
    print("sort finish")
    print("list finish")

    samplepack = Samplepack()
    samples = []
    print("I am reading")

    def _emit(session_id, clicks, next_idx):
        """Append the Sample of one finished session; return the next index."""
        idxes = []
        for item in clicks:
            if item not in item2idx:
                # Never hand out the padding index to a real item.
                if next_idx == pad_idx:
                    next_idx += 1
                item2idx[item] = next_idx
                next_idx += 1
            idxes.append(item2idx[item])
        sample = Sample()
        sample.id = len(samples)
        sample.session_id = session_id
        sample.click_items = clicks
        sample.items_idxes = idxes
        # Input is every click but the last, target is every click but the
        # first (each position is paired with the click that follows it).
        sample.in_idxes = idxes[:-1]
        sample.out_idxes = idxes[1:]
        samples.append(sample)
        return next_idx

    last_id = None
    click_items = []
    for s_id, item_id in zip(data['sessionId'].values,
                             data['itemId'].values):
        # A change of session id closes the session collected so far.
        if click_items and s_id != last_id:
            idx_cnt = _emit(last_id, click_items, idx_cnt)
            click_items = []
        last_id = s_id
        click_items.append(item_id)

    # Flush the trailing session; with no rows there is nothing to flush, so
    # no empty placeholder sample is produced.
    if click_items:
        idx_cnt = _emit(last_id, click_items, idx_cnt)
        print(samples[-1])

    samplepack.samples = samples
    samplepack.init_id2sample()
    return samplepack, idx_cnt
Esempio n. 2
0
def _load_data(file_path, item2idx, idx_cnt, pro=None, pad_idx=0):
    """Load a CSV click log and group it into per-session samples.

    The file is read with pandas and ordered by ``SessionId`` then ``Time``.
    Optionally only the last ``1 / pro`` of the rows is kept. Every run of
    consecutive rows sharing a ``SessionId`` becomes one ``Sample`` whose
    item ids are translated to indices through ``item2idx``.

    Args:
        file_path: Path of the CSV file; the ``SessionId``, ``Time`` and
            ``ItemId`` columns are read.
        item2idx: Mapping from raw item id to index. Updated in place with
            every item that was not present yet.
        idx_cnt: Next free index to hand out to an unseen item.
        pro: When given, only the last ``int(rows / pro)`` rows are used and
            the first session of that tail is dropped (it may have been cut
            in the middle). ``None`` keeps every row.
        pad_idx: Reserved index that is never assigned to an item.

    Returns:
        A ``(samplepack, idx_cnt)`` tuple: the ``Samplepack`` holding one
        sample per session (none when no rows remain) and the next free
        index after all new items were registered.
    """
    data = pd.read_csv(file_path, dtype={'ItemId': np.int64})
    print("read finish")
    # Sort ascending by session id, then by time, so that the rows of one
    # session are contiguous and in click order.
    data.sort_values(['SessionId', 'Time'], inplace=True)
    print("sort finish")
    print("list finish")

    session_data = data['SessionId'].values
    item_event = data['ItemId'].values
    if pro is not None:
        lenth = int(len(session_data) / pro)
        print(lenth)
        # Slice from an explicit start: ``[-0:]`` would keep everything
        # instead of nothing when the requested tail is empty.
        start = max(len(session_data) - lenth, 0)
        session_data = session_data[start:]
        item_event = item_event[start:]
        # Skip the first session of the tail. When the tail holds a single
        # session there is no boundary and nothing is left afterwards.
        cut = len(session_data)
        for pos in range(1, len(session_data)):
            if session_data[pos] != session_data[pos - 1]:
                cut = pos
                break
        session_data = session_data[cut:]
        item_event = item_event[cut:]
    lenth = len(session_data)
    print(lenth)

    samplepack = Samplepack()
    samples = []
    print("I am reading")

    def _emit(session_id, clicks, next_idx):
        """Append the Sample of one finished session; return the next index."""
        idxes = []
        for item in clicks:
            if item not in item2idx:
                # Never hand out the padding index to a real item.
                if next_idx == pad_idx:
                    next_idx += 1
                item2idx[item] = next_idx
                next_idx += 1
            idxes.append(item2idx[item])
        sample = Sample()
        sample.id = len(samples)
        sample.session_id = session_id
        sample.click_items = clicks
        sample.items_idxes = idxes
        # Input is every click but the last, target is every click but the
        # first (each position is paired with the click that follows it).
        sample.in_idxes = idxes[:-1]
        sample.out_idxes = idxes[1:]
        samples.append(sample)
        return next_idx

    last_id = None
    click_items = []
    for s_id, item_id in zip(session_data, item_event):
        # A change of session id closes the session collected so far.
        if click_items and s_id != last_id:
            idx_cnt = _emit(last_id, click_items, idx_cnt)
            click_items = []
        last_id = s_id
        click_items.append(item_id)

    # Flush the trailing session; with no rows there is nothing to flush, so
    # no empty placeholder sample is produced.
    if click_items:
        idx_cnt = _emit(last_id, click_items, idx_cnt)
        print(samples[-1])

    samplepack.samples = samples
    samplepack.init_id2sample()
    return samplepack, idx_cnt
Esempio n. 3
0
        datas.append(cdatas)
        # solve last data
    for i in rand_idx[count:]:
        datas[-1].append(dataset[i])

    tmp = copy.deepcopy(datas)
    for i in xrange(3):
        datas = copy.deepcopy(tmp)
        test_datas = datas.pop(i)
        trian_datas = []
        path = "../datas/3cross/cross" + str(i + 1) + "/"
        for x in datas:
            trian_datas += x
        print len(test_datas)
        print len(trian_datas)
        samplepack_train = Samplepack()
        samplepack_train.samples = trian_datas
        samplepack_train.init_id2sample()
        samplepack_test = Samplepack()
        samplepack_test.samples = test_datas
        samplepack_test.init_id2sample()
        dump_file([samplepack_train, path + mid_dong_train_data],
                  [samplepack_test, path + mid_dong_test_data])

    # for i in xrange(3):
    #     datas = copy.deepcopy(tmp)
    #     train_datas = datas.pop(i)
    #     # train_datas2 = datas.pop(i)
    #     # train_datas = train_datas1+train_datas2
    #     test_datas = []
    #     path = "../datas/3train/cross" + str(i + 1) + "/"
Esempio n. 4
0
def _load_data(file_path, word2idx, idx_cnt, pad_idx, class_num):
    """Load aspect-level samples from a three-lines-per-record text file.

    Each record consists of three consecutive lines: the sentence with the
    aspect replaced by the ``$t$`` placeholder, the aspect term, and an
    integer label. Sentence and aspect are lower-cased, split on whitespace
    and translated to indices through ``word2idx``.

    Args:
        file_path: Path of the text file to read.
        word2idx: Mapping from lower-cased token to index. Updated in place
            with every token that was not present yet.
        idx_cnt: Next free index to hand out to an unseen token.
        pad_idx: Reserved index that is never assigned to a token.
        class_num: When ``2``, records whose label maps to ``'neutral'``
            (according to ``Sample.label2pol``) are skipped.

    Returns:
        A ``(samplepack, idx_cnt)`` tuple: the ``Samplepack`` with the kept
        samples and the next free index after all new tokens were
        registered.

    Raises:
        ValueError: If a sentence line lacks the ``$t$`` placeholder or a
            label line is not an integer.
        IndexError: If the last record has fewer than three lines.
    """
    # Read everything up front so the file handle is released right away.
    with open(file_path, 'r') as data:
        lines = data.readlines()

    samplepack = Samplepack()
    samples = []
    now_id = 0

    def _index(tokens, next_idx, idx2word):
        """Map already lower-cased tokens to indices, registering new ones."""
        idxes = []
        for w in tokens:
            if w not in word2idx:
                # Never hand out the padding index to a real token.
                if next_idx == pad_idx:
                    next_idx += 1
                word2idx[w] = next_idx
                next_idx += 1
            idxes.append(word2idx[w])
            idx2word[word2idx[w]] = w
        return idxes, next_idx

    # Remap of the raw file labels; any other value is kept unchanged.
    # NOTE(review): the meaning of the target ids is defined by
    # Sample.label2pol, which is not visible here.
    label_map = {-1: 1, 1: 0, 0: 2}

    for i in range(0, len(lines), 3):
        sample = Sample()
        row_text = lines[i].lower().strip()
        index1 = row_text.index('$t$')
        left_tokens = row_text[:index1].split()
        # Skips the 3-character placeholder plus the one character after it
        # (presumably the separating space).
        right_tokens = row_text[index1 + 4:].split()

        rasp = lines[i + 1].lower().strip()
        crt_position = [index1, index1 + len(rasp)]

        # Index left context, aspect, then right context, in that order, so
        # new tokens receive their indices in reading order.
        local_idx2word = {}
        left_tktext_idxes, idx_cnt = _index(
            left_tokens, idx_cnt, local_idx2word)
        crt_asp, idx_cnt = _index(rasp.split(), idx_cnt, local_idx2word)
        right_tktext_idxes, idx_cnt = _index(
            right_tokens, idx_cnt, local_idx2word)

        # Context without the aspect, and the full sentence with it.
        crt_ctx_rmasp = left_tktext_idxes + right_tktext_idxes
        crt_sent = left_tktext_idxes + crt_asp + right_tktext_idxes

        # Token-level [start, end) spans inside the full sentence.
        asp_start = len(left_tktext_idxes)
        asp_end = asp_start + len(crt_asp)
        left_subposition = [0, asp_start]
        subposition = [asp_start, asp_end]
        right_subposition = [asp_end, len(crt_sent)]

        raw_label = int(lines[i + 2].strip())
        crt_lab = label_map.get(raw_label, raw_label)

        if crt_lab == 3:
            continue
        if class_num == 2 and sample.label2pol(crt_lab) == 'neutral':
            continue

        sample.id = now_id
        sample.sent_id = now_id
        now_id += 1
        sample.aspect = rasp
        sample.text = row_text.replace("$t$", lines[i + 1].strip())
        sample.aspect_idxes = crt_asp
        sample.text_idxes = crt_sent
        sample.left_context_idxes = left_tktext_idxes
        sample.right_context_idxes = right_tktext_idxes
        sample.context_idxes = crt_ctx_rmasp
        sample.label = crt_lab
        sample.aspect_charpos = crt_position
        sample.aspect_wordpos = subposition
        sample.left_wordpos = left_subposition
        sample.right_wordpos = right_subposition
        sample.local_idx2word = local_idx2word
        samples.append(sample)

    samplepack.samples = samples
    samplepack.init_id2sample()
    return samplepack, idx_cnt
def _load_data(file_path, word2idx, idx_cnt, pad_idx, class_num):
    """Load aspect-level samples from an XML file.

    Every child of the XML root is treated as a sentence element with a
    ``text`` child and an ``id`` attribute. Each ``aspectTerm`` found under
    its ``aspectTerms`` elements yields one candidate ``Sample``. The text
    left and right of the aspect and the aspect itself are tokenized with
    ``tokenize`` and translated to indices through ``word2idx``.

    Args:
        file_path: Path of the XML file to parse.
        word2idx: Mapping from token to index. Updated in place with every
            token that was not present yet.
        idx_cnt: Next free index to hand out to an unseen token.
        pad_idx: Reserved index that is never assigned to a token.
        class_num: When ``2``, aspect terms whose label maps to
            ``'neutral'`` (according to ``Sample.label2pol``) are skipped.

    Returns:
        A ``(samplepack, idx_cnt)`` tuple: the ``Samplepack`` with the kept
        samples and the next free index after all new tokens were
        registered.
    """
    root = ET.parse(file_path).getroot()

    pack = Samplepack()
    collected = []
    next_sample_id = 0

    def _index(tokens, next_idx, idx2word):
        """Map tokens to indices, registering unseen ones on the way."""
        out = []
        for tok in tokens:
            if tok not in word2idx:
                # Never hand out the padding index to a real token.
                if next_idx == pad_idx:
                    next_idx += 1
                word2idx[tok] = next_idx
                next_idx += 1
            out.append(word2idx[tok])
            idx2word[word2idx[tok]] = tok
        return out, next_idx

    for sentence in root:
        row_text = sentence.find("text").text.lower()
        sent_id = sentence.get("id")
        for term_group in sentence.iter('aspectTerms'):
            # Walk over the aspect terms of one sentence.
            for term in term_group.findall('aspectTerm'):
                sample = Sample()
                aspect_text = term.get("term").lower()
                aspect_tokens = tokenize(aspect_text)

                polarity = term.get("polarity")
                char_from = int(term.get("from"))
                char_to = int(term.get("to"))

                # Character offsets split the sentence around the aspect.
                left_tokens = tokenize(row_text[0:char_from])
                right_tokens = tokenize(row_text[char_to:])

                # Index left context, aspect, then right context, in that
                # order, so new tokens get their indices in reading order.
                idx2word = {}
                left_idxes, idx_cnt = _index(left_tokens, idx_cnt, idx2word)
                aspect_idxes, idx_cnt = _index(
                    aspect_tokens, idx_cnt, idx2word)
                right_idxes, idx_cnt = _index(
                    right_tokens, idx_cnt, idx2word)

                # Context with the aspect removed, and the full sentence.
                context_idxes = left_idxes + right_idxes
                sentence_idxes = left_idxes + aspect_idxes + right_idxes

                # Token-level [start, end) spans inside the full sentence.
                aspect_start = len(left_idxes)
                aspect_end = aspect_start + len(aspect_idxes)

                label = sample.pol2label(polarity)
                if label == 3:
                    continue
                if class_num == 2 and sample.label2pol(label) == 'neutral':
                    continue

                sample.id = next_sample_id
                next_sample_id += 1
                sample.sent_id = sent_id
                sample.aspect = aspect_text
                sample.text = row_text
                sample.aspect_idxes = aspect_idxes
                sample.text_idxes = sentence_idxes
                sample.left_context_idxes = left_idxes
                sample.right_context_idxes = right_idxes
                sample.context_idxes = context_idxes
                sample.label = label
                sample.aspect_charpos = [char_from, char_to]
                sample.aspect_wordpos = [aspect_start, aspect_end]
                sample.left_wordpos = [0, aspect_start]
                sample.right_wordpos = [aspect_end, len(sentence_idxes)]
                sample.local_idx2word = idx2word
                collected.append(sample)

    pack.samples = collected
    pack.init_id2sample()
    return pack, idx_cnt