def _load_data(file_path, item2idx, idx_cnt, pad_idx, class_num):
    """Read a tab-separated click log and pack one Sample per session.

    The input must have 'sessionId', 'itemId' and 'Time' columns; rows are
    sorted by (sessionId, Time) so each session's clicks come out in
    chronological order.  Raw item ids are mapped to contiguous integer
    indexes via ``item2idx`` (shared across calls so train/test files agree),
    skipping ``pad_idx`` so it stays reserved for padding.

    :param file_path: path of the tab-separated csv file.
    :param item2idx: dict raw item id -> contiguous index; mutated in place.
    :param idx_cnt: next free index value; the updated value is returned.
    :param pad_idx: index reserved for padding, never given to a real item.
    :param class_num: unused here; kept for signature parity with the
        other loaders in this project.
    :return: (Samplepack of per-session Samples, updated idx_cnt).
    """
    data = pd.read_csv(file_path, sep='\t', dtype={'itemId': np.int64})
    print("read finish")
    # sort ascending by session id, then by time within a session
    data.sort_values(['sessionId', 'Time'], inplace=True)
    print("sort finish")
    print("list finish")

    def _items_to_idxes(click_items, idx_cnt):
        # Map raw item ids to contiguous indexes, growing item2idx as needed.
        item_dixes = []
        for item in click_items:
            if item not in item2idx:
                if idx_cnt == pad_idx:
                    idx_cnt += 1  # never hand the padding index to a real item
                item2idx[item] = idx_cnt
                idx_cnt += 1
            item_dixes.append(item2idx[item])
        return item_dixes, idx_cnt

    def _make_sample(sample_id, session_id, click_items, item_dixes):
        # One Sample per session: inputs are all clicks but the last,
        # prediction targets are all clicks but the first.
        sample = Sample()
        sample.id = sample_id
        sample.session_id = session_id
        sample.click_items = click_items
        sample.items_idxes = item_dixes
        sample.in_idxes = item_dixes[:-1]
        sample.out_idxes = item_dixes[1:]
        return sample

    samplepack = Samplepack()
    samples = []
    now_id = 0
    print("I am reading")
    last_id = None
    click_items = []
    for s_id, item_id in zip(list(data['sessionId'].values),
                             list(data['itemId'].values)):
        if last_id is None:
            last_id = s_id
        if s_id != last_id:
            # Session boundary: flush the accumulated clicks as one Sample.
            item_dixes, idx_cnt = _items_to_idxes(click_items, idx_cnt)
            samples.append(_make_sample(now_id, last_id, click_items, item_dixes))
            last_id = s_id
            click_items = []
            now_id += 1
        click_items.append(item_id)
    # Flush the final session (the original also emits an empty Sample when
    # the file has no rows; that behaviour is preserved).
    item_dixes, idx_cnt = _items_to_idxes(click_items, idx_cnt)
    sample = _make_sample(now_id, last_id, click_items, item_dixes)
    samples.append(sample)
    print(sample)
    samplepack.samples = samples
    samplepack.init_id2sample()
    return samplepack, idx_cnt
def _load_data(file_path, item2idx, idx_cnt, pro=None, pad_idx=0):
    """Read a click log csv and pack one Sample per session.

    Like the tab-separated loader, but the file is comma-separated
    ('SessionId'/'ItemId'/'Time' columns) and an optional ``pro`` keeps only
    the trailing 1/pro fraction of the rows, dropping the leading partial
    session from that slice.

    :param file_path: path of the csv file.
    :param item2idx: dict raw item id -> contiguous index; mutated in place.
    :param idx_cnt: next free index value; the updated value is returned.
    :param pro: if given, keep only the last len(data)/pro rows.
    :param pad_idx: index reserved for padding, never given to a real item.
    :return: (Samplepack of per-session Samples, updated idx_cnt).
    """
    data = pd.read_csv(file_path, dtype={'ItemId': np.int64})
    print("read finish")
    # sort ascending by session id, then by time within a session
    data.sort_values(['SessionId', 'Time'], inplace=True)
    print("sort finish")
    print("list finish")
    session_data = list(data['SessionId'].values)
    item_event = list(data['ItemId'].values)
    if pro is not None:
        lenth = int(len(session_data) / pro)
        print(lenth)
        session_data = session_data[-lenth:]
        item_event = item_event[-lenth:]
        # The tail slice may start in the middle of a session, so drop rows up
        # to the first session boundary.  The original loop indexed i + 1
        # without a bound and crashed (IndexError) when the slice held a
        # single session, and raised NameError on an empty slice; here those
        # cases leave the slice unchanged instead.
        cut = 0
        for i in range(len(session_data) - 1):
            if session_data[i] != session_data[i + 1]:
                cut = i + 1
                break
        session_data = session_data[cut:]
        item_event = item_event[cut:]
        lenth = len(session_data)
        print(lenth)

    def _items_to_idxes(click_items, idx_cnt):
        # Map raw item ids to contiguous indexes, growing item2idx as needed.
        item_dixes = []
        for item in click_items:
            if item not in item2idx:
                if idx_cnt == pad_idx:
                    idx_cnt += 1  # never hand the padding index to a real item
                item2idx[item] = idx_cnt
                idx_cnt += 1
            item_dixes.append(item2idx[item])
        return item_dixes, idx_cnt

    def _make_sample(sample_id, session_id, click_items, item_dixes):
        # One Sample per session: inputs are all clicks but the last,
        # prediction targets are all clicks but the first.
        sample = Sample()
        sample.id = sample_id
        sample.session_id = session_id
        sample.click_items = click_items
        sample.items_idxes = item_dixes
        sample.in_idxes = item_dixes[:-1]
        sample.out_idxes = item_dixes[1:]
        return sample

    samplepack = Samplepack()
    samples = []
    now_id = 0
    print("I am reading")
    last_id = None
    click_items = []
    for s_id, item_id in zip(session_data, item_event):
        if last_id is None:
            last_id = s_id
        if s_id != last_id:
            # Session boundary: flush the accumulated clicks as one Sample.
            item_dixes, idx_cnt = _items_to_idxes(click_items, idx_cnt)
            samples.append(_make_sample(now_id, last_id, click_items, item_dixes))
            last_id = s_id
            click_items = []
            now_id += 1
        click_items.append(item_id)
    # Flush the final session (also emitted when there are no rows, matching
    # the original behaviour).
    item_dixes, idx_cnt = _items_to_idxes(click_items, idx_cnt)
    sample = _make_sample(now_id, last_id, click_items, item_dixes)
    samples.append(sample)
    print(sample)
    samplepack.samples = samples
    samplepack.init_id2sample()
    return samplepack, idx_cnt
datas.append(cdatas) # solve last data for i in rand_idx[count:]: datas[-1].append(dataset[i]) tmp = copy.deepcopy(datas) for i in xrange(3): datas = copy.deepcopy(tmp) test_datas = datas.pop(i) trian_datas = [] path = "../datas/3cross/cross" + str(i + 1) + "/" for x in datas: trian_datas += x print len(test_datas) print len(trian_datas) samplepack_train = Samplepack() samplepack_train.samples = trian_datas samplepack_train.init_id2sample() samplepack_test = Samplepack() samplepack_test.samples = test_datas samplepack_test.init_id2sample() dump_file([samplepack_train, path + mid_dong_train_data], [samplepack_test, path + mid_dong_test_data]) # for i in xrange(3): # datas = copy.deepcopy(tmp) # train_datas = datas.pop(i) # # train_datas2 = datas.pop(i) # # train_datas = train_datas1+train_datas2 # test_datas = [] # path = "../datas/3train/cross" + str(i + 1) + "/"
def _load_data(file_path, word2idx, idx_cnt, pad_idx, class_num):
    """Load aspect-sentiment samples from a '$t$' 3-lines-per-record file.

    Records are triples of lines: the sentence with the aspect replaced by
    the placeholder ``$t$``, the aspect term, and a polarity label in
    {-1, 0, 1}.  Labels are remapped to the project scheme
    (-1 -> 1, 1 -> 0, 0 -> 2).  Words are lower-cased and mapped to
    contiguous indexes via ``word2idx`` (mutated in place), skipping
    ``pad_idx`` so it stays reserved for padding.

    :param file_path: path of the text file.
    :param word2idx: dict word -> contiguous index; mutated in place.
    :param idx_cnt: next free index value; the updated value is returned.
    :param pad_idx: index reserved for padding, never given to a real word.
    :param class_num: if 2, neutral samples are skipped.
    :return: (Samplepack of aspect Samples, updated idx_cnt).
    """
    def _words_to_idxes(words, idx_cnt, local_idx2word):
        # Map (lower-cased) words to contiguous indexes, growing word2idx
        # and recording idx -> word in this sample's local vocabulary.
        idxes = []
        for w in words:
            lw = w.lower()
            if lw not in word2idx:
                if idx_cnt == pad_idx:
                    idx_cnt += 1  # never hand the padding index to a real word
                word2idx[lw] = idx_cnt
                idx_cnt += 1
            idxes.append(word2idx[lw])
            local_idx2word[word2idx[lw]] = lw
        return idxes, idx_cnt

    samplepack = Samplepack()
    samples = []
    now_id = 0
    # The original opened the file and never closed it; 'with' fixes the leak.
    with open(file_path, 'r') as data:
        tmp = [line for line in data]
    for i in xrange(0, len(tmp), 3):
        sample = Sample()
        row_text = tmp[i].lower().strip()
        index1 = row_text.index('$t$')
        left_row_text = row_text[0:index1]
        # '$t$' is 3 characters; +4 also skips the character right after it
        # (presumably a space) -- NOTE(review): confirm against the data format.
        right_row_text = row_text[index1 + 4:]
        local_idx2word = {}
        left_tktext_idxes, idx_cnt = _words_to_idxes(
            left_row_text.split(), idx_cnt, local_idx2word)
        rasp = tmp[i + 1].lower().strip()
        # Character span of the aspect inside the reconstructed sentence.
        crt_position = [index1, index1 + len(rasp)]
        crt_asp, idx_cnt = _words_to_idxes(rasp.split(), idx_cnt, local_idx2word)
        right_tktext_idxes, idx_cnt = _words_to_idxes(
            right_row_text.split(), idx_cnt, local_idx2word)
        # Context with the aspect removed, and the full sentence with word
        # positions of the left part / aspect / right part.
        crt_ctx_rmasp = left_tktext_idxes + right_tktext_idxes
        n_left = len(left_tktext_idxes)
        n_asp = len(crt_asp)
        crt_sent = left_tktext_idxes + crt_asp + right_tktext_idxes
        left_subposition = [0, n_left]
        subposition = [n_left, n_left + n_asp]
        right_subposition = [n_left + n_asp, len(crt_sent)]
        # Remap raw polarity {-1, 0, 1} to the project's label ids.
        crt_lab = int(tmp[i + 2].strip())
        if crt_lab == -1:
            crt_lab = 1
        elif crt_lab == 1:
            crt_lab = 0
        elif crt_lab == 0:
            crt_lab = 2
        if crt_lab != 3:
            if class_num == 2 and sample.label2pol(crt_lab) == 'neutral':
                continue  # 2-class setting: drop neutral samples
            sample.id = now_id
            sample.sent_id = now_id
            now_id += 1
            sample.aspect = rasp
            # Note: the raw (un-lowercased) aspect line is substituted back,
            # matching the original code.
            sample.text = row_text.replace("$t$", tmp[i + 1].strip())
            sample.aspect_idxes = crt_asp
            sample.text_idxes = crt_sent
            sample.left_context_idxes = left_tktext_idxes
            sample.right_context_idxes = right_tktext_idxes
            sample.context_idxes = crt_ctx_rmasp
            sample.label = crt_lab
            sample.aspect_charpos = crt_position
            sample.aspect_wordpos = subposition
            sample.left_wordpos = left_subposition
            sample.right_wordpos = right_subposition
            sample.local_idx2word = local_idx2word
            samples.append(sample)
    samplepack.samples = samples
    samplepack.init_id2sample()
    return samplepack, idx_cnt
def _load_data(file_path, word2idx, idx_cnt, pad_idx, class_num):
    """Load aspect-sentiment samples from a SemEval-style XML file.

    Each <sentence> holds a text plus <aspectTerm> elements carrying the
    term, its polarity, and its character span (from/to).  For every aspect
    term a Sample is built with the tokenized left context, aspect, and
    right context mapped to contiguous word indexes via ``word2idx``
    (mutated in place), skipping ``pad_idx`` so it stays reserved for
    padding.

    :param file_path: path of the XML file.
    :param word2idx: dict word -> contiguous index; mutated in place.
    :param idx_cnt: next free index value; the updated value is returned.
    :param pad_idx: index reserved for padding, never given to a real word.
    :param class_num: if 2, neutral samples are skipped.
    :return: (Samplepack of aspect Samples, updated idx_cnt).
    """
    def _words_to_idxes(words, idx_cnt, local_idx2word):
        # Map words to contiguous indexes, growing word2idx and recording
        # idx -> word in this sample's local vocabulary.
        idxes = []
        for w in words:
            if w not in word2idx:
                if idx_cnt == pad_idx:
                    idx_cnt += 1  # never hand the padding index to a real word
                word2idx[w] = idx_cnt
                idx_cnt += 1
            idxes.append(word2idx[w])
            local_idx2word[word2idx[w]] = w
        return idxes, idx_cnt

    tree = ET.parse(file_path)
    root = tree.getroot()
    samplepack = Samplepack()
    samples = []
    now_id = 0
    for sentence in root:
        row_text = sentence.find("text").text.lower()
        sent_id = sentence.get("id")
        # Iterate the aspect terms of one sentence.
        for asp_terms in sentence.iter('aspectTerms'):
            for asp_term in asp_terms.findall('aspectTerm'):
                sample = Sample()
                rasp = asp_term.get("term").lower()
                asps = tokenize(rasp)
                crt_pol_text = asp_term.get("polarity")
                crt_from = int(asp_term.get("from"))
                crt_to = int(asp_term.get("to"))
                crt_position = [crt_from, crt_to]
                local_idx2word = {}
                # Tokenize and index left context, aspect, right context.
                left_tktext_idxes, idx_cnt = _words_to_idxes(
                    tokenize(row_text[0:crt_from]), idx_cnt, local_idx2word)
                crt_asp, idx_cnt = _words_to_idxes(
                    asps, idx_cnt, local_idx2word)
                right_tktext_idxes, idx_cnt = _words_to_idxes(
                    tokenize(row_text[crt_to:]), idx_cnt, local_idx2word)
                # Context with the aspect removed, and the full sentence with
                # word positions of the left part / aspect / right part.
                crt_ctx_rmasp = left_tktext_idxes + right_tktext_idxes
                n_left = len(left_tktext_idxes)
                n_asp = len(crt_asp)
                crt_sent = left_tktext_idxes + crt_asp + right_tktext_idxes
                left_subposition = [0, n_left]
                subposition = [n_left, n_left + n_asp]
                right_subposition = [n_left + n_asp, len(crt_sent)]
                crt_lab = sample.pol2label(crt_pol_text)
                if crt_lab != 3:
                    if class_num == 2 and sample.label2pol(
                            crt_lab) == 'neutral':
                        continue  # 2-class setting: drop neutral samples
                    sample.id = now_id
                    now_id += 1
                    sample.sent_id = sent_id
                    sample.aspect = rasp
                    sample.text = row_text
                    sample.aspect_idxes = crt_asp
                    sample.text_idxes = crt_sent
                    sample.left_context_idxes = left_tktext_idxes
                    sample.right_context_idxes = right_tktext_idxes
                    sample.context_idxes = crt_ctx_rmasp
                    sample.label = crt_lab
                    sample.aspect_charpos = crt_position
                    sample.aspect_wordpos = subposition
                    sample.left_wordpos = left_subposition
                    sample.right_wordpos = right_subposition
                    sample.local_idx2word = local_idx2word
                    samples.append(sample)
    samplepack.samples = samples
    samplepack.init_id2sample()
    return samplepack, idx_cnt