def intention_graph(path): with open(path, "r") as file: lines = file.readlines() texts = list() intention_list = list() for line in lines: text = json.loads(line)["intention_list"] texts.append(text) # print _uniout.unescape(str(texts), 'utf8') for intentions in texts: for intention in intentions: if intention not in intention_list: intention_list.append(intention) # print len(intention_list) # print _uniout.unescape(str(intention_list), 'utf8') intention_graph = dict() for intention in intention_list: key = intention value = list() for x in texts: if key in x and x.index(key) + 1 < len(x) and x[x.index(key) + 1] not in value: value.append(x[x.index(key) + 1]) intention_graph[key] = value print _uniout.unescape(str(intention_graph), 'utf8') return intention_graph
def test_unicode():
    """Copy the first question of each JSON-lines record in test_data.txt
    into test_dump2.txt as {"question": ...}, printing each one unescaped
    to the console along the way (Python 2 unicode smoke test)."""
    with open('test_dump2.txt', 'w+') as out:
        with open('test_data.txt', 'r') as file:
            for line in file:
                text = json.loads(line)
                result = dict()
                question_list = text["question_list"]
                s = question_list[0]
                # print type(s)
                # s = s.decode("utf-8")
                print _uniout.unescape(s, "utf-8")
                # NOTE(review): json.loads already yields unicode objects in
                # Python 2; calling .decode("utf-8") on unicode implicitly
                # ascii-ENCODES first and raises UnicodeEncodeError for any
                # non-ASCII question — confirm this is the intended behavior.
                result["question"] = s.decode("utf-8")
                # ensure_ascii=False keeps CJK characters readable in the dump.
                out.write(json.dumps(result, ensure_ascii=False) + "\n")
if __name__ == '__main__':
    # Pass 1: collect the distinct class labels (column 0) from the TSV file.
    # `data_path` and `classes` are module-level names defined elsewhere.
    with open(data_path, 'r') as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            a = line[0]
            if a not in classes:
                classes.append(a)
    # print("classes:", _uniout.unescape(str(classes), 'utf8'))
    # Pass 2: group labels by input text (column 1). Labels are comma-joined
    # slot tuples; a_slots[0]/a_slots[1] are the first two slot values.
    with open(data_path, 'r') as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            a = line[0]
            b = line[1]
            a_slots = a.split(',')
            if b not in inputs:
                inputs[b] = []
            inputs[b].append(a)
            # NOTE(review): this loop appends `a` to inputs[b] WHILE iterating
            # inputs[b], so newly appended copies are re-visited and `a` can be
            # duplicated once per matching prior entry — confirm whether
            # iterating over a snapshot (inputs[b][:]) was intended.
            for x in inputs[b]:
                x_slots = x.split(',')
                if x_slots[0] == a_slots[0] and x_slots[1] != a_slots[1]:
                    inputs[b].append(a)
    # Report ambiguous inputs: any input text mapped to 2+ labels.
    # (iteritems is Python 2 only.)
    for inp, intentions in inputs.iteritems():
        if len(intentions) >= 2:
            # results[inp] = intentions
            print(inp, _uniout.unescape(str(intentions), 'utf8'))
    # print(_uniout.unescape(str(results), 'utf8'))
    # print(_uniout.unescape(str(results['用卡取两百块']), 'utf8'))
i["noun"] = i["noun"].replace(item.decode("utf-8"), 'Transportation') ##9## for item in shouldReplaceList9: i["noun"] = i["noun"].replace(item.decode("utf-8"), 'Venue') # print type(i["noun"]) corpus += (i["noun"]) ### WordCounts(ALL) for doc in [corpus]: tf = Counter() for word in doc.split(): tf[word] += 1 for x,i in enumerate(tf.items()): print x+1,_uniout.unescape(str(i), 'utf8') ### 對 Dict 某條件下分組 # print "Original list:" # pprint.pprint(data) data.sort(key=operator.itemgetter('hotel')) # pprint.pprint(data) ### group the departments in lists list1 = [] for key, items in itertools.groupby(data, operator.itemgetter('hotel')): list1.append(list(items)) # print "After grouping the list by department:" # pprint.pprint(list1) ### create a list of department number and average age in each department
def cn(q):
    """Return *q* rendered as a readable (unescaped) UTF-8 string."""
    rendered = str(q)
    return _uniout.unescape(rendered, 'utf8')
def print_cn(q, others=''):
    """Print *q* unescaped into readable UTF-8, followed by *others*."""
    unescaped = _uniout.unescape(str(q), 'utf8')
    print(unescaped, others)
def jieba_cut(self, input_):
    """Tokenize *input_* with jieba in precise mode and return the
    space-joined, unescaped token string."""
    segments = jieba.cut(input_, cut_all=False)
    joined = " ".join(segments)
    return _uniout.unescape(str(joined), 'utf8')
def print_cn(*q):
    """Comma-join all arguments and print them unescaped as readable UTF-8."""
    joined = ','.join(q)
    print(_uniout.unescape(joined, 'utf8'))
def cut(self, input_):
    """Strip simple punctuation from *input_*, then tokenize with jieba
    (precise mode) and return the space-joined, unescaped token string."""
    cleaned = QueryUtils.static_simple_remove_punct(input_)
    segmented = " ".join(jieba.cut(cleaned, cut_all=False))
    return _uniout.unescape(str(segmented), 'utf8')
def cut(self, input_):
    """Strip Chinese punctuation from *input_*, then tokenize with jieba
    in full mode and return the space-joined, unescaped token string."""
    cleaned = QueryUtils.static_remove_cn_punct(input_)
    pieces = jieba.cut(cleaned, cut_all=True)
    segmented = " ".join(pieces)
    return _uniout.unescape(str(segmented), 'utf8')