def preRead():
    """Load the vocabulary file and publish it through module globals.

    Sets ``id2ch`` to whatever ``ljqpy.LoadList('id2ch_w2v.txt')`` returns,
    ``ch2id`` to the reverse mapping (entry -> index, the last index wins for
    duplicated entries), resets ``id2tg`` to an empty list and stores the
    number of entries in ``vocabulary_size``, which is also printed.
    """
    global id2ch, ch2id, id2tg, vocabulary_size
    id2ch = ljqpy.LoadList('id2ch_w2v.txt')
    index_of = {}
    for position, entry in enumerate(id2ch):
        index_of[entry] = position
    ch2id = index_of
    id2tg = []
    vocabulary_size = len(id2ch)
    print('vocabulary: %d' % (vocabulary_size))
def preRead():
    """Load ``id2ch.txt`` from ``data_path`` into the module-level vocabulary.

    Populates the globals ``id2ch`` (result of ``ljqpy.LoadList``), ``ch2id``
    (entry -> index; the last index wins for duplicated entries), ``id2tg``
    (reset to an empty list) and ``vocabulary_size``, then prints the size.
    """
    global id2ch, ch2id, id2tg, vocabulary_size
    vocab_path = data_path + 'id2ch.txt'
    id2ch = ljqpy.LoadList(vocab_path)
    reverse = {}
    for position, entry in enumerate(id2ch):
        reverse[entry] = position
    ch2id = reverse
    id2tg = []
    vocabulary_size = len(id2ch)
    print(vocabulary_size)
def MakeS2SDict(fn=None, min_freq=5, delimiter=' ', dict_file=None):
    """Build the input-side and output-side token lists for a seq2seq model.

    When ``dict_file`` is given and already exists, both lists are read back
    from it: entries before the ``'<@@@>'`` marker are the input tokens and
    entries after it are the output tokens.

    Otherwise the rows returned by ``ljqpy.LoadCSV(fn)`` are scanned; the
    first two fields of each row are split on ``delimiter`` and every token
    is counted per side. Tokens whose count is at least ``min_freq`` are kept,
    in the order produced by ``ljqpy.FreqDict2List``. If ``dict_file`` is
    given, the two lists are saved to it, separated by the marker.

    Returns:
        ``(itokens, otokens)``, both wrapped in ``TokenList``.
    """
    separator = '<@@@>'

    # Fast path: reuse a previously saved dictionary file.
    if dict_file is not None and os.path.exists(dict_file):
        print('loading', dict_file)
        cached = ljqpy.LoadList(dict_file)
        split_at = cached.index(separator)
        return TokenList(cached[:split_at]), TokenList(cached[split_at + 1:])

    # Slow path: count token frequencies separately for each side.
    src_counts, tgt_counts = {}, {}
    for row in ljqpy.LoadCSV(fn):
        # zip() stops after two fields because there are only two counters.
        for text, counts in zip(row, (src_counts, tgt_counts)):
            for token in text.split(delimiter):
                counts[token] = counts.get(token, 0) + 1

    src_words, tgt_words = [
        [token for token, freq in ljqpy.FreqDict2List(counts) if freq >= min_freq]
        for counts in (src_counts, tgt_counts)
    ]
    print('seq 1 words:', len(src_words))
    print('seq 2 words:', len(tgt_words))

    itokens = TokenList(src_words)
    otokens = TokenList(tgt_words)
    if dict_file is not None:
        ljqpy.SaveList(src_words + [separator] + tgt_words, dict_file)
    return itokens, otokens
def TextRank():
    """Print jieba TextRank keywords for each comma-containing line.

    Lines of ``training/merged_text.txt`` without a ``','`` are skipped. For
    every remaining line the line itself is printed, followed by one
    ``keyword weight`` row per result and a 30-dash divider.
    """
    divider = '-' * 30
    for sentence in ljqpy.LoadList('training/merged_text.txt'):
        if ',' in sentence:
            print(sentence)
            # NOTE(review): allowPOS=None is passed through unchanged; confirm
            # the installed jieba version accepts None for this argument.
            ranked = jieba.analyse.textrank(sentence, withWeight=True, allowPOS=None)
            for keyword, weight in ranked:
                print('%s %s' % (keyword, weight))
            print(divider)
def GetEdgeFromCoocc():
    """Derive tag co-occurrence edges and write them to a file.

    Steps:
      1. Read one JSON object per line from ``training/all_data.txt``; store
         ``id -> text`` in ``datadict``, attach ``GetTags(text)`` to each
         record as ``'tf'`` and collect the records in ``datalist``.
      2. Compute document frequency and ``idf = log(N / df)`` per tag, then
         keep as ``tags`` the non-numeric tags with ``df > 2``, ``idf > 2``
         and length greater than one.
      3. For each record, pair its tags with the tags of nearby earlier
         records whose id shares the same prefix before ``'@'``; count each
         unordered pair in ``r2cnt`` and each participating word in ``r1cnt``.
      4. Score each direction of a pair as ``pair_count / r1cnt[word]`` and
         write ``['coocc', w_from, w_to, score]`` rows to
         ``gen_rels/edges_coocc.txt``.

    Side effects: rebinds the globals ``datalist``, ``datadict``, ``idf``,
    ``tags`` and ``r1cnt``; prints progress; overwrites the output file.

    NOTE(review): both early ``break``s below presumably rely on
    ``ljqpy.FreqDict2List`` yielding items in descending value order --
    confirm against that helper.
    """
    global datalist, datadict, idf, tags, r1cnt
    datalist = []
    datadict = {}
    df = defaultdict(int)
    for line in ljqpy.LoadList('training/all_data.txt'):
        jj = json.loads(line)
        datadict[jj['id']] = jj['text']
        tf = GetTags(jj['text'])
        for t in tf.keys():
            df[t] += 1
        jj['tf'] = tf
        datalist.append(jj)

    N = len(datalist)
    idf = {x: math.log(N / s) for x, s in df.items()}
    tags = {x for x, s in df.items() if s > 2 and idf[x] > 2 and len(x) > 1}
    tags = {x for x in tags if not x.isdigit()}
    print('docu segs:', N)
    print('tags:', len(tags))

    lasttts = []  # recent (id, tag list) entries, trimmed below
    r2cnt = defaultdict(int)
    r1cnt = defaultdict(int)
    for i, jj in enumerate(datalist):
        seg_id, words = jj['id'], jj['tf']
        tt = [x for x in words.keys() if x in tags]
        if i % 1000 == 0:
            print('datalist %d/%d' % (i, len(datalist)))
        for mi in range(3):
            if mi >= i:
                continue
            # The ``i == 0`` arm can never be taken here because the guard
            # above skips every ``mi`` when ``i == 0``.
            # NOTE(review): for mi == 0 the index ``-mi`` is 0, i.e. the
            # OLDEST retained entry rather than the most recent one --
            # confirm whether that is intended.
            lid, lasttt = (seg_id, tt) if i == 0 else lasttts[-mi]
            if lid.split('@')[0] != seg_id.split('@')[0]:
                break
            for w1 in tt:
                for w2 in lasttt:
                    if w1 in w2 or w2 in w1:
                        continue
                    # Order the pair into fresh locals. Swapping ``w1``/``w2``
                    # in place would rebind the outer loop variable and pair
                    # the remaining ``w2`` values with the wrong word.
                    first, second = (w2, w1) if w2 < w1 else (w1, w2)
                    r2cnt[(first, second)] += 1
                    r1cnt[first] += 1
                    r1cnt[second] += 1
        lasttts.append((seg_id, tt))
        if len(lasttts) > 10:
            lasttts = lasttts[5:]

    relscs = {}
    for g2, ng2 in ljqpy.FreqDict2List(r2cnt):
        for i, w in enumerate(g2):
            # Directed score: share of w's pairings that involve the partner.
            relscs[(w, g2[1 - i])] = ng2 / r1cnt[w]
        # The cutoff is checked after recording, so the first pair that falls
        # below 100 is still included.
        if ng2 < 100:
            break

    with open('gen_rels/edges_coocc.txt', 'w', encoding='utf-8') as fout:
        for g2, rel in ljqpy.FreqDict2List(relscs):
            if rel < 0.2:
                break
            ljqpy.WriteLine(fout, ['coocc', g2[0], g2[1], rel])
def __init__(self, min_limit, stopword=''):
    """Initialise the instance state and run ``collect_arw_article``.

    :param min_limit: stored unchanged as ``self.min_limit``.
    :param stopword: either a ``list`` used directly as the stopword list,
        or a ``str`` path. For a path, the stopword list starts with a fixed
        set of whitespace characters and is extended with
        ``ljqpy.LoadList(path)`` when the path exists; otherwise a message is
        printed. For any other type ``self.stopword`` is left unset.
    """
    self.min_limit = min_limit

    # Exact type checks (not isinstance) are kept on purpose so subclasses
    # are treated exactly as before.
    if type(stopword) is list:
        self.stopword = stopword
    elif type(stopword) is str:
        self.stopword = [' ', '\r\n', '\r', '\n', '\t', '\u3000']
        if not os.path.exists(stopword):
            print('stopword path not exists')
        else:
            self.stopword += ljqpy.LoadList(stopword)

    self.fragment_count = 0
    self.documents = []
    self.label = {}
    self.article = []
    self.keyword_docs = []
    self.collect_arw_article()
def GetEdgeFromCNDB():
    """Write knowledge-graph edges between graph nodes to ``gen_rels/edges_kg.txt``.

    Nodes are read from ``saved_graph/graph_nodes.txt`` and processed in
    batches of 100. For each batch:

    * ``api.Ment2Ent`` maps mentions to candidate entities; the first
      candidate is used, and mentions with no candidate or whose entity
      contains '歌' or '影' are skipped.
    * ``api.GetEntTriplesMulti`` returns triples per entity; every ``<a>``
      link text in a triple's object that is itself a node (and differs from
      the mention) yields a ``KG`` edge and a reverse ``KGi`` edge, weighted
      ``0.5 + 0.5 / number_of_candidates``.

    After all batches, mentions that resolved to the same entity are linked
    pairwise in both directions with ``KGm`` edges of weight 1.
    """
    nodes = ljqpy.LoadList('saved_graph/graph_nodes.txt')
    node_set = set(nodes)
    batch_size = 100
    total = len(nodes)
    mentions_by_entity = {}

    with open('gen_rels/edges_kg.txt', 'w', encoding='utf-8') as fout:
        for start in range(0, total, batch_size):
            print('%d/%d' % (start, len(nodes)))
            batch = nodes[start:start + batch_size]
            m2e = api.Ment2Ent(batch)

            # Resolve each mention to its first candidate entity.
            entity_of = {}
            for mention in batch:
                candidates = m2e.get(mention, [])
                if len(candidates) == 0:
                    continue
                entity = candidates[0]
                if '歌' in entity or '影' in entity:
                    continue
                entity_of[mention] = entity
                mentions_by_entity.setdefault(entity, []).append(mention)

            triples = api.GetEntTriplesMulti(
                list(entity_of.values()), keephref=True, nospecial=0)

            batch_edges = []
            for mention, entity in entity_of.items():
                entity_triples = triples.get(entity, [])
                # More candidate entities -> lower confidence in the link.
                weight = 0.5 + 0.5 / len(m2e.get(mention, []))
                for _pred, obj in entity_triples:
                    for target in re.findall('<a.+?>(.+?)</a>', obj):
                        if target == mention:
                            continue
                        if target in node_set:
                            batch_edges.append(('KG', mention, target, weight))
                            batch_edges.append(('KGi', target, mention, weight))

            for edge in batch_edges:
                ljqpy.WriteLine(fout, edge)

        # Link every pair of mentions that share an entity, both directions.
        for entity, mentions in mentions_by_entity.items():
            for idx, later in enumerate(mentions):
                for earlier in mentions[:idx]:
                    ljqpy.WriteLine(fout, ['KGm', later, earlier, 1])
                    ljqpy.WriteLine(fout, ['KGm', earlier, later, 1])
def MakeS2SDict(fn=None, min_freq=5, delimiter=' ', dict_file=None):
    '''
    Build the word (or char) lists for the input and output sequences.
    :param fn: data file passed to ``ljqpy.LoadCSV``; only read when no
        saved dictionary is available
    :param min_freq: minimum count a token needs in order to be kept
    :param delimiter: separator used to split each sequence into tokens
    :param dict_file: optional file holding both lists, separated by a
        ``'<@@@>'`` entry; loaded if it exists, written otherwise
    :return: ``(itokens, otokens)`` as ``TokenList`` objects
    '''
    marker = '<@@@>'

    # An existing word/char list file means nothing has to be rebuilt.
    have_saved = dict_file is not None and os.path.exists(dict_file)
    if have_saved:
        print('loading', dict_file)
        stored = ljqpy.LoadList(dict_file)
        cut = stored.index(marker)
        itokens = TokenList(stored[:cut])
        otokens = TokenList(stored[cut+1:])
        return itokens, otokens

    # Otherwise rebuild: one frequency table per side (input, output).
    freq_tables = ({}, {})
    for fields in ljqpy.LoadCSV(fn):
        for field, table in zip(fields, freq_tables):
            for token in field.split(delimiter):
                if token in table:
                    table[token] += 1
                else:
                    table[token] = 1

    # Keep only tokens that occur at least min_freq times.
    kept = []
    for table in freq_tables:
        ranked = ljqpy.FreqDict2List(table)
        kept.append([token for token, count in ranked if count >= min_freq])
    in_words, out_words = kept

    print('seq 1 words:', len(in_words))
    print('seq 2 words:', len(out_words))
    itokens = TokenList(in_words)
    otokens = TokenList(out_words)
    if dict_file is not None:
        ljqpy.SaveList(in_words+[marker]+out_words, dict_file)
    return itokens, otokens
def Run():
    """Solve the next test case stored at the front of the global ``lst``.

    The first entry of ``lst`` is the size ``N``; the next ``N`` entries are
    the rows of a matrix of digit characters. Those entries are removed from
    ``lst``, and the global ``N`` is rebound to the case size.

    Cell ``(x, y)`` contributes ``int(cell) << (x * N + y)`` to the mask
    ``mvv``. Every mask ``v`` below ``2 ** (N * N)`` that shares no bit with
    ``mvv`` is considered; the result is the smallest ``GetNum(v)`` for which
    ``Check(mvv + v)`` is truthy, or ``N * N`` when no candidate improves on
    it. The result is returned as a decimal string.
    """
    global N, lst
    N = int(lst[0])
    mat = lst[1:N + 1]
    lst = lst[N + 1:]

    mns = N * N
    mvv = 0
    for x in range(N):
        for y in range(N):
            mvv += int(mat[x][y]) << (x * N + y)

    best = mns
    for v in range(2 ** mns):
        if (v & mvv) != 0:
            continue
        added = GetNum(v)
        # Only run Check on candidates that would improve the current best.
        if added < best and Check(mvv + v):
            best = added
    return '%d' % best


# Driver: read all cases, write one "Case #k" line per case, open the result.
lst = ljqpy.LoadList('D-small-attempt0.in')
outf = 'D-small-attempt0.out'
with open(outf, 'w') as fout:
    # The loop range is fixed here, so Run() rebinding the global N is harmless.
    N = int(lst[0])
    lst = lst[1:]
    for k in range(N):
        case_line = 'Case #%d: %s\n' % (1 + k, Run())
        fout.write(case_line)
        fout.flush()
os.system('emeditor.exe ' + outf)
print('completed')
z = q[qh] qh += 1 for i in edge.get(z, []): if mm[i] != -1: continue mm[i] = mm[z] + 1 q.append(i) return max(mm) #print(ls) rr = len(ls) for v in ls: rr += FindL(v) return max([rt, rr]) lst = ljqpy.LoadList('C-large.in') outf = 'C-large.out' with open(outf, 'w') as fout: ii = 0 T = int(lst[ii]) ii += 1 for k in range(T): n = int(lst[ii]) ii += 1 inp = tuple(map(int, lst[ii].split())) ii += 1 fout.write('Case #%d: %d\n' % (1 + k, Run(n, inp))) os.system('emeditor.exe ' + outf) print('completed')
def MakeMerged():
    """Collect the ``text`` field of every JSON line into one text list.

    Each entry of ``training/all_data.txt`` is parsed as JSON and its
    ``'text'`` value is kept; the resulting list is saved with
    ``ljqpy.SaveList`` to ``training/merged_text.txt``.
    """
    merged = [
        json.loads(record)['text']
        for record in ljqpy.LoadList('training/all_data.txt')
    ]
    ljqpy.SaveList(merged, 'training/merged_text.txt')
random.seed(1333) st = set() while j > 0: v = [1] + [random.randint(0, 1) for x in range(n - 2)] + [1] zz = ''.join(str(x) for x in v) if zz in st: continue st.add(zz) rt = [] for b in range(2, 11): z = 0 for c in v: z = z * b + c rt.append(CheckNotPrime(z)) if rt[-1] == 0: break if rt[-1] != 0: print(zz, rt) fout.write(zz + ' ') fout.write(' '.join(str(x) for x in rt)) fout.write('\n') j -= 1 lst = ljqpy.LoadList('input.txt') outf = 'C-large.out' with open(outf, 'w') as fout: fout.write('Case #1:\n') Run(32, 500, fout) os.system(outf) print('completed')
for i in range(1, k + 1): for j in range(0, i + 1): if j > 0: f[i][j] += f[i - 1][j - 1] * pps[i - 1] f[i][j] += f[i - 1][j] * (1 - pps[i - 1]) return f[k][k // 2] def Run(p1, p2): n, k = map(int, p1.split()) plst = list(map(float, p2.split())) ret = 0.0 for v in range(2**n): pps = [plst[u] for u in range(n) if (v & (1 << u)) != 0] if len(pps) != k: continue ret = max(ret, Compute(pps)) return '%.8f' % ret lst = ljqpy.LoadList('B-small-attempt2.in') outf = 'B-small-attempt2.out' with open(outf, 'w') as fout: N = int(lst[0]) for k in range(N): fout.write('Case #%d: %s\n' % (1 + k, Run(lst[k * 2 + 1], lst[k * 2 + 2]))) fout.flush() os.system('emeditor.exe ' + outf) print('completed')