def solve(S):
    # LCS of S with its own reverse yields the longest palindromic
    # subsequence of S.
    _S = list(reversed(S))
    row, col = len(S) + 1, len(_S) + 1
    C = [[-1 for j in range(col)] for i in range(row)]
    B = [[None for j in range(col)] for i in range(row)]
    LCS.LCS_length(S, _S, B, C)
    LCS.rebuid_solution(S, _S, B, C, len(S), len(_S))
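LCS.LCS_length and LCS.rebuid_solution are not shown here; a minimal sketch, assuming the usual length-table/backpointer pair that the C and B arguments suggest (the 'diag'/'up'/'left' markers are an assumption; the real module may differ):

def LCS_length(X, Y, B, C):
    # assumed implementation: C[i][j] = LCS length of X[:i] and Y[:j],
    # B holds backpointers for reconstruction
    for i in range(len(X) + 1):
        for j in range(len(Y) + 1):
            if i == 0 or j == 0:
                C[i][j] = 0
            elif X[i - 1] == Y[j - 1]:
                C[i][j] = C[i - 1][j - 1] + 1
                B[i][j] = 'diag'
            elif C[i - 1][j] >= C[i][j - 1]:
                C[i][j] = C[i - 1][j]
                B[i][j] = 'up'
            else:
                C[i][j] = C[i][j - 1]
                B[i][j] = 'left'

def rebuid_solution(X, Y, B, C, i, j):
    # assumed implementation: walk the backpointers, printing one LCS in order
    if i == 0 or j == 0:
        return
    if B[i][j] == 'diag':
        rebuid_solution(X, Y, B, C, i - 1, j - 1)
        print(X[i - 1], end='')
    elif B[i][j] == 'up':
        rebuid_solution(X, Y, B, C, i - 1, j)
    else:
        rebuid_solution(X, Y, B, C, i, j - 1)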
def testLCS(self):
    s1 = "abc"
    s2 = "dabc"
    t1 = 'XMJYAUZ'
    t2 = 'MZJAWXU'
    t = LCS.lcs(t1, t2)
    print t
    self.assertEqual(t, "MJAU")
def dist(a, b):
    """
    Input: vector A, vector B
    Output: the LCS distance between the two vectors
    """
    a_new = copy.deepcopy(a)
    b_new = copy.deepcopy(b)
    return LCS.get_lcs_distance(a_new, b_new)
def __init__(self, pal1, pal2):
    self.base = pal1
    self.posbase = 0
    self.objective = pal2
    self.lcs = LCS.lcs(pal1, pal2)
    self.poslcs = 0
    self.mem = {}
    self.cost = 0
def dist(a, b):
    """
    Input: vector A, vector B
    Output: the LCS distance between the two vectors (0 if they are equal)
    """
    a_new = copy.deepcopy(a)
    b_new = copy.deepcopy(b)
    if a_new == b_new:
        return 0
    return LCS.get_lcs_distance(a_new, b_new)
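LCS.get_lcs_distance (used by both dist variants and by the silhouette code below) is not shown; a minimal sketch of one common LCS-based distance, assuming the normalization 1 - |LCS(a, b)| / max(|a|, |b|); the real module may normalize differently:

def get_lcs_distance(a, b):
    # assumed implementation: distance in [0, 1], 0 for identical sequences
    m, n = len(a), len(b)
    if m == 0 or n == 0:
        return 1.0
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if a[i - 1] == b[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
    return 1.0 - dp[m][n] / float(max(m, n))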
def init(self, rid_list):
    self.rid_list = random.sample(rid_list, 200 if len(rid_list) > 200 else len(rid_list))
    self.desc_filter = filter_desc.DescFilter()
    self.desc_filter.init('./pattern/book_name_pattern.json',
                          './pattern/pen_name_pattern.json',
                          './pattern/desc_pattern.json')
    self.lcs = LCS()
    return True
def __init__(self, pal1, pal2):
    self.base = pal1
    self.posbase = 0
    self.objective = pal2
    self.lcs = LCS.lcs(pal1, pal2)  # O(len(pal1) * len(pal2))
    self.poslcs = 0
    self.mem = {}
    self.diff = len(pal2) - len(pal1)  # O(1)
    self.cost = 0
    self.term = self.diff < 0  # O(1)
    self.inser = self.diff if self.diff > 0 else 0  # O(1)
    self.borr = self.diff if self.diff < 0 else 0  # O(1)
def getLCSFromFiles(fileOne, fileTwo, ignoreLineDir=True, ignoreComments=True,
                    ignoreFileSpecifics=False):
    """
    Process an LCS object given two files.
    See getStringsFromFiles for arg info.
    """
    seqOne, seqTwo = getStringsFromFiles(fileOne, fileTwo, ignoreLineDir,
                                         ignoreComments, ignoreFileSpecifics)
    retLCS = LCS.LCS(seqOne, seqTwo)
    return retLCS
def CalculateSilhouetteCoefficient(result):
    """
    Compute the silhouette coefficient of a clustering result.
    :param result: the clustering result (a list of clusters)
    :return: the silhouette coefficient
    """
    S = 0
    num_global = 0
    if len(result) == 1:
        return -1
    for i in range(len(result)):
        if len(result[i]) == 1:
            # a singleton cluster contributes a silhouette value of 0
            S = S + 0
            num_global = num_global + 1
        else:
            for j in range(len(result[i])):
                a = 0
                b_list = []
                num_global = num_global + 1
                # a: mean distance to the other members of the same cluster
                for k in range(len(result[i])):
                    if j == k:
                        continue
                    a = a + LCS.get_lcs_distance(result[i][j], result[i][k])
                a = a / (len(result[i]) - 1)
                # b: smallest distance to any member of another cluster
                for i_tmp in range(len(result)):
                    if i == i_tmp:
                        continue
                    for j_tmp in range(len(result[i_tmp])):
                        b_list.append(LCS.get_lcs_distance(result[i][j], result[i_tmp][j_tmp]))
                if len(b_list) == 0:
                    return 0
                b = min(b_list)
                if max(a, b) == 0:
                    break
                S = S + (b - a) / max(a, b)
    S = S / num_global
    return S
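For reference, the value accumulated per point above is the standard silhouette score s = (b - a) / max(a, b); a minimal sketch isolating that step, with the degenerate max(a, b) == 0 case mapped to 0 as the loop above effectively does:

def silhouette_point(a, b):
    # a: mean intra-cluster distance, b: smallest other-cluster distance;
    # result lies in [-1, 1]
    return 0.0 if max(a, b) == 0 else (b - a) / max(a, b)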
def __calcEdgeWeights(self, content):
    # holds the edge weight values where the entry at index i is the edge
    # weight from the new article to the article with id number i
    edgeWeights = []
    # POS tags for function words to exclude from the comparison
    function_word_tags = {"DT", "IN", "WP", "WRB", "WP$", "WDT", "EX", "TO", "RP"}
    c2 = content.lower()
    Arr2len = len(c2)
    c2 = nltk.word_tokenize(c2)
    tagged2 = nltk.pos_tag(c2)
    c2 = [w for w, t in tagged2 if t not in function_word_tags]
    c2 = c2[0:500]
    for i in range(len(self.contents)):
        c1 = self.contents[i].lower()
        Arr1len = len(c1)
        c1 = nltk.word_tokenize(c1)
        tagged1 = nltk.pos_tag(c1)
        c1 = [w for w, t in tagged1 if t not in function_word_tags]
        c1 = c1[0:500]
        weight = LCS.LCS(c1, c2)
        weight = weight / ((Arr1len + Arr2len) / 2)
        # weight = weight * weight
        edgeWeights.append(weight)
    edgeWeights.append(0)
    return edgeWeights
# encoding=utf-8
import split_sentence
import select_keywords_sentence
import LCS
# import lcs_2

dataroot = "dataset/test"
seacher_word = "tencentMa1998"
question = "built"
OriginData = "%s/%s_origin.txt" % (dataroot, seacher_word)
SplitedData = "%s/%s_split.txt" % (dataroot, seacher_word)
FliteredData = "%s/%s_fliter.txt" % (dataroot, seacher_word)
ModeData = "%s/%s_mode.txt" % (dataroot, question)

# words that sentences must contain (filter keywords)
KeyWords = ["腾讯", "马化腾", "1998"]
# words to generalize; genKeyWords must be a subset of KeyWords
genKeyWords = ["腾讯", "马化腾", "1998"]

# question = "date a company/organization was founded"
split_sentence._split_sentence(OriginData, SplitedData)
# test selecting the keyword sentences
select_keywords_sentence._find_keywords_sentence(SplitedData, FliteredData, KeyWords)
print LCS.lcs(FliteredData, ModeData, genKeyWords)
# iterate tp_end with a step size of 2:
tp_end = tp_start + 2
while index + 1 < len(CUT) and tp_end < len(TP_TIME):
    # initialize the list of identity (match-score) values:
    IDENTITY = []
    # binary-search SP_TIME for the times corresponding to the
    # beat-derived sp_start and sp_end:
    T1 = SP_TIME[BC.binary_search(SP_TIME, float(sp_start))]
    T2 = SP_TIME[BC.binary_search(SP_TIME, float(sp_end))]
    T3 = TP_TIME[tp_start]
    while tp_end <= tp_start + len_tp * 3 and tp_end < len(TP_TIME):
        # step by 2 from tp_start + 2 up to tp_start + len_tp * 3, computing
        # the identity of every candidate [tp_start, tp_end] interval;
        # store the identity in column 0 and the matching tp_end in
        # column 1 so tp_end can be read back later:
        IDENTITY.append([LCS.lcs(T1, T2, T3, TP_TIME[tp_end]), tp_end])
        tp_end += 2
    # guard against an empty list:
    if IDENTITY != []:
        # convert IDENTITY to a numpy array:
        IDENTITY = np.array(IDENTITY)
        # index of the maximum identity in column 0:
        MAX_ID = np.argmax(IDENTITY, axis=0)
        # save sp_start, sp_end, tp_start and the tp_end of the maximum
        # identity into MAX_SEG:
        if (int(IDENTITY[MAX_ID[0]][1]) + 10) < len(TP_TIME):
            MAX_SEG.append([
                sp_start, sp_end, TP_TIME[tp_start],
                TP_TIME[int(IDENTITY[MAX_ID[0]][1]) + 10]
            ])
        else:
    new_list1 = []
    new_list2 = []
    for i in range(size1):
        new_list1.append(choice(ascii_uppercase))
    for j in range(size2):
        new_list2.append(choice(ascii_uppercase))
    return new_list1, new_list2

if __name__ == "__main__":
    # input length of sequences
    m = int(input("Set the length of the first sequence: "))
    n = int(input("Set the length of the second sequence: "))
    # generate two random character sequences
    new_lists = generate_List(10, m, n)
    print(new_lists[0])
    print(new_lists[1])
    # create a naiveMethod instance and run it to solve the LCS problem
    nm = nm.naiveMethod()
    nm_lcs = lcs.LCS(new_lists[0], new_lists[1], nm)
    nm_result, nm_time = nm_lcs.run()
    print("The result: " + str(nm_result) + "\nThe time: " + str(nm_time))
    # create a dynamicProgramming instance and run it to solve the LCS problem
    dp = dp.dynamicProgramming(m, n)
    dp_lcs = lcs.LCS(new_lists[0], new_lists[1], dp)
    dp_result, dp_time = dp_lcs.run()
    print("The result: " + str(dp_result) + "\nThe time: " + str(dp_time - 1))
for file in os.listdir('.'):
    if os.path.isfile(file) and file.endswith(".txt") and file != 'message.txt':
        files.append(file)

for each_file1 in files:
    li = [each_file1]
    li_lcs = [each_file1]
    for each_file2 in files:
        if each_file1 != each_file2:
            file1 = open(each_file1, 'r')
            file2 = open(each_file2, 'r')
            file1_string = file1.read()
            file2_string = file2.read()
            word_list1 = words_count(file1_string)
            word_list2 = words_count(file2_string)
            li.append(round(Euclidean_norm(word_list1, word_list2), 2))
            li_lcs.append(LCS.longestSubstringFinder(file1_string, file2_string))
        else:
            li.append('X')
            li_lcs.append('X')
    bag_of_words.append(li)
    longest_sequence_substring.append(li_lcs)

# print(longest_sequence_substring)
print('Executed on : ' + str(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
print("*********Bag of words********")
_print_matrix_(bag_of_words)
print("*********Longest Sequence Substring********")
_print_matrix_(longest_sequence_substring)
print('---------------------------------------------------------------------------------')
sys.stdout = old_stdout
log_file.close()
import LCS
import en_Num
import ko_Num
import no_single_space
import JoongangDaily
import ngram
import lcslib

# count = no_single_space.no_single_space(0, 6)
# print(count)
# lcslib.check_answer([], 1, "word_fill", 5, 0.3)
# ngram.storeDictionary("final_dic.csv", "dictionary.csv", 266081)
# LCS2.run(0, 1831, dic, 5, 5, 3, 9)
# LCS.run2(1, 5, 3, 3)
# JoongangDaily.save_content(0, 10)
# no_single_space.no_single_space(0, 10)
# print("number_ko")
# ko_Num.ko_Num(0, 10)
# print("number_en")
# en_Num.en_Num(0, 10)
# print("LCS")

# 8217
root = ngram.getRoot("../dictionary.csv")
LCS.run(1, 1, root, 5, 5, 3, 3)
def Similarity(a, b):
    # similarity from the LCS, normalized by the geometric mean of the lengths
    k = mod13.find_lcs_len(a, b) / math.sqrt(len(a) * len(b))
    print("similarity: ", k)
    return k
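mod13.find_lcs_len is not shown; a minimal sketch, assuming the textbook O(mn) dynamic program for the LCS length:

def find_lcs_len(a, b):
    # assumed implementation: dp[i][j] = LCS length of a[:i] and b[:j]
    m, n = len(a), len(b)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if a[i - 1] == b[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
    return dp[m][n]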
N += 1
print(index - TP_INDEX, N - 1)
Comp_End = float(TP_BEAT[index])
if index - TP_INDEX != N - 1:
    # when the number of beats between SP and TP differs, trust the match
    # result: first remove the beat-detection results:
    for m in range(1, index - TP_INDEX):
        TP_BEAT.pop(TP_INDEX + 1)
    for n in range(1, N - 1):
        Comp_Start = float(TP_BEAT[TP_INDEX + n - 1])
        end = Comp_Start + 0.2
        MaxComp = 0
        MaxBoundary = Comp_Start
        while end < Comp_End:
            Comp = LCS.lcs(SP_BEAT[SP_INDEX + n - 1], SP_BEAT[SP_INDEX + n],
                           Comp_Start, end)
            end += 0.2
            MaxComp = max(MaxComp, Comp)
            if MaxComp == Comp:
                MaxBoundary = end
        TP_BEAT.insert(TP_INDEX + n, str(MaxBoundary))
    # then add the matched boundaries back in:
    # for n in range(1, N - 1):
    #     temp_index = BC.binary_search(MATCH1, float(SP_BEAT[SP_INDEX + n]))
    #     TP_BEAT.insert(TP_INDEX + n, MATCH[temp_index][1])
    SP_INDEX += N - 1
    TP_INDEX += N - 1
    continue
else:
    # when the beat counts agree, compare the two match scores Comp and
    # take the higher one as the new boundary:
idx = start_idx
while idx <= end_idx:
    if idx % 10 != 0:
        idx = idx + 1
        continue
    print(str(idx) + ".txt")
    kLink, eLink, percent = wiki.check_all_pair(dic, idx)
    print("check all pair end")
    # check = wiki.make_file_for_LCS(kLink, eLink, root, idx)
    # print("make file for lcs end")
    if percent == -1:
        i += 1
        idx = idx + 1
        print("makefileforLCS error")
        continue
    en_Num.en_Num(idx, idx)
    print("en num end")
    ko_Num.ko_Num(idx, idx)
    print("ko num end")
    LCS.run(idx, idx, root, 5, 5, 3, 3, kLink, eLink)
    idx = idx + 1

# LCS2.run(0, 1831, dic, 5, 5, 3, 9)
# LCS.run2(1, 5, 3, 3)
import LCS
import en_Num
import ko_Num
import div_eng
import div_kor
import herald_word_text
import ngram
import lcslib

# count = herald_word_text.herald_word_text(0, 6)
# print(count)
# div_eng.div_eng(0, 1800)
# div_kor.div_kor(0, 1800)
# en_Num.en_Num(0, 1798)
# ko_Num.ko_Num(0, 1798)
# lcslib.check_answer([], 1, "word_fill", 5, 0.3)
# ngram.storeDictionary("../../final_dic.csv", "../../dictionary.csv", 266081)

root = ngram.getRoot("../dictionary.csv")
LCS.run(1, 1798, root, 5, 5, 3, 3)
# LCS2.run(0, 1831, dic, 5, 5, 3, 9)
# LCS.run2(1, 5, 3, 3)
#!/usr/bin/python
# Filename: Longest-Common-Subsequence.py
import random
import LCSLength
import LCS

arrayX = [0, 'a', 'b', 'c', 'b', 'd', 'a', 'b']
arrayY = [0, 'b', 'd', 'c', 'a', 'b', 'a']
m = len(arrayX) - 1
n = len(arrayY) - 1

c = []
b = []
for i in range(0, m + 1):
    c.append([])
    for j in range(0, n + 1):
        c[i].append(0)
for i in range(0, m + 1):
    b.append([])
    for j in range(0, n + 1):
        b[i].append(0)

LCSLength.LCSLength(m, n, arrayX, arrayY, c, b)
LCS.LCS(m, n, arrayX, b)
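LCSLength.LCSLength and LCS.LCS are not shown; a minimal sketch, assuming they follow the CLRS LCS-LENGTH / PRINT-LCS pseudocode implied by the driver's 1-indexed, sentinel-padded arrays (the 'diag'/'up'/'left' markers are an assumption):

def LCSLength(m, n, X, Y, c, b):
    # assumed implementation: fill c with prefix LCS lengths, b with
    # backpointers; index 0 of X and Y is a sentinel
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if X[i] == Y[j]:
                c[i][j] = c[i - 1][j - 1] + 1
                b[i][j] = 'diag'
            elif c[i - 1][j] >= c[i][j - 1]:
                c[i][j] = c[i - 1][j]
                b[i][j] = 'up'
            else:
                c[i][j] = c[i][j - 1]
                b[i][j] = 'left'

def LCS(m, n, X, b):
    # assumed implementation: walk the backpointer table, printing the
    # subsequence in order
    if m == 0 or n == 0:
        return
    if b[m][n] == 'diag':
        LCS(m - 1, n - 1, X, b)
        print(X[m], end=' ')
    elif b[m][n] == 'up':
        LCS(m - 1, n, X, b)
    else:
        LCS(m, n - 1, X, b)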
def test_ExpectedOutput(self):
    '''
    Checks if returned output is as expected.
    '''
    output = LCS.LCS(["abcabb", "bacb"])
    self.assertEqual(output, 3)
def mapping(self):
    """Main mapping routine."""
    # initial beat mapping:
    for item in self.sp_beat:
        sp_index = self.binary_search(self.old_match_sp, item)
        self.tp_beat_old.append(self.old_match[sp_index][1])
    for item in self.sp_beat:
        sp_index = self.binary_search(self.new_match_sp, item)
        self.tp_beat_new.append(self.new_match[sp_index][1])
    # check whether the two match mappings agree; mark agreement with 1,
    # otherwise 0:
    for i in range(len(self.tp_beat_old)):
        mean = 0.0
        if i == 0:
            self.tp_beat.append(float(self.tp_beat_new[0]))
            self.mark.append(1)
        else:
            if abs(float(self.tp_beat_new[i]) - float(self.tp_beat_old[i])) <= 0.1:
                self.tp_beat.append(float(self.tp_beat_new[i]))
                self.mark.append(1)
            else:
                # use the LCS algorithm to score the similarity of the
                # current beat unit:
                sp_start = self.sp_beat[i - 1]
                sp_end = self.sp_beat[i]
                tp_start = self.tp_beat[i - 1]
                tp_new_end = self.tp_beat_new[i]
                tp_old_end = self.tp_beat_old[i]
                identity_old1 = LCS.lcs(
                    sp_start, sp_end, tp_start, tp_old_end,
                    self.save_directory + "/sp_note.csv",
                    self.save_directory + "/tp_note.csv")
                identity_new1 = LCS.lcs(
                    sp_start, sp_end, tp_start, tp_new_end,
                    self.save_directory + "/sp_note.csv",
                    self.save_directory + "/tp_note.csv")
                # compute the gap of the current beat unit and compare it
                # with the mean gap of the previous 10 beat units to derive
                # an identity score:
                if i <= 10:
                    if identity_new1 >= identity_old1:
                        self.tp_beat.append(self.tp_beat_new[i])
                        self.mark.append(1)
                    else:
                        self.tp_beat.append(self.tp_beat_old[i])
                        self.mark.append(1)
                else:
                    mean = (float(self.tp_beat[i - 1]) - float(self.tp_beat[i - 11])) / 10
                    gap_old = float(self.tp_beat_old[i]) - float(self.tp_beat[i - 1])
                    gap_new = float(self.tp_beat_new[i]) - float(self.tp_beat[i - 1])
                    identity_old2 = (1 - abs(mean - gap_old) / mean) * 100 / 3
                    identity_new2 = (1 - abs(mean - gap_new) / mean) * 100 / 3
                    identity_old = identity_old1 + identity_old2
                    identity_new = identity_new1 + identity_new2
                    # print(i, identity_new1, identity_new2, identity_old1, identity_old2)
                    # take the beat time with the higher identity as the
                    # final result:
                    if identity_new >= identity_old:
                        self.tp_beat.append(self.tp_beat_new[i])
                        if identity_new > 28.0:
                            self.mark.append(1)
                        else:
                            self.mark.append(0)
                    else:
                        self.tp_beat.append(self.tp_beat_old[i])
                        if identity_old > 28.0:
                            self.mark.append(1)
                        else:
                            self.mark.append(0)
    return self.tp_beat
class AuthorityDesc(object):
    def __init__(self):
        pass

    def init(self, rid_list):
        self.rid_list = random.sample(rid_list, 200 if len(rid_list) > 200 else len(rid_list))
        self.desc_filter = filter_desc.DescFilter()
        self.desc_filter.init('./pattern/book_name_pattern.json',
                              './pattern/pen_name_pattern.json',
                              './pattern/desc_pattern.json')
        self.lcs = LCS()
        return True

    def __del__(self):
        pass

    def exit(self):
        return True

    def run(self):
        self.process()
        return True

    def process(self):
        print 'start processing'
        authority_info_list = self.fetch_authority_info()
        authority_info_len = len(authority_info_list)
        authority_info_index = 0
        for rid, book_name, pen_name, book_desc in authority_info_list:
            print 'index: {0}. tot: {1}'.format(authority_info_index, authority_info_len)
            authority_info_index += 1
            book_desc_update = self.check_authority_desc(book_name, pen_name, rid)
        print 'finish processing'
        return True

    def fetch_authority_info(self):
        print 'start fetching authority info'
        authority_info_list = []
        try:
            conn = MySQLdb.connect(host='10.46.7.171', port=4198, user='******',
                                   passwd='C9l3U4n6M2e1', db='novels_new')
            conn.set_character_set('GBK')
            conn.autocommit(True)
        except Exception as e:
            print 'fail to connect to the cluster db. error: {0}'.format(e)
            return authority_info_list
        cursor = conn.cursor()
        query_sql = 'SELECT rid, book_name, pen_name, description FROM novel_authority_info ' \
                    'WHERE rid = %s AND rank > 0'
        for rid in self.rid_list:
            cursor.execute(query_sql, (rid,))
            row = cursor.fetchone()
            if not row:
                continue
            authority_info_list.append(row)
        cursor.close()
        conn.close()
        print 'finish fetching authority info. no: {0}'.format(len(authority_info_list))
        return authority_info_list

    def check_authority_desc(self, book_name, pen_name, rid):
        """ """
        authority_desc = ''
        cluster_info = self.fetch_cluster_info(book_name, rid)
        cluster_info = sorted(cluster_info, key=lambda item: item[-1], reverse=True)
        if not cluster_info:
            return authority_desc
        native_desc_list = []
        for site_id, dir_id, dir_url, rank in cluster_info:
            row = self.fetch_native_desc(site_id, dir_id)
            if not row:
                continue
            raw_book_name, raw_pen_name, raw_desc = row
            native_desc = self.desc_filter.filter_desc(
                site_id, *map(lambda uni_str: unicode(uni_str, 'GBK', 'ignore'),
                              [raw_book_name, raw_pen_name, raw_desc]))
            if not self.is_valid_desc(native_desc):
                continue
            print '-' * 20
            print 'site_id: {0}'.format(site_id)
            print 'dir_id: {0}'.format(dir_id)
            print 'dir_url: {0}'.format(dir_url)
            #print 'raw_desc: {0}'.format(raw_desc)
            print 'native_desc: {0}'.format(native_desc.encode('GBK', 'ignore'))
            native_desc_list.append(native_desc)
        authority_desc = self.authority_desc_strategy(native_desc_list)
        authority_desc = authority_desc.replace(u'\u0003', unicode(book_name, 'GBK'))
        authority_desc = authority_desc.replace(u'\u0004', unicode(pen_name, 'GBK'))
        authority_desc = authority_desc.encode('GBK', 'ignore')
        dir_url = cluster_info[0][2]
        print '*' * 20
        print '\t'.join(map(str, [rid, book_name, pen_name, dir_url, authority_desc]))
        return authority_desc

    def is_valid_desc(self, desc):
        """ """
        valid_desc_len_threshold = 5
        desc_filtered = re.sub(u'[^\u4e00-\u9fa5\w\u0003\u0004]', '', desc)
        return len(desc_filtered) >= valid_desc_len_threshold

    def authority_desc_strategy(self, native_desc_list):
        """ """
        authority_desc = ''
        if not native_desc_list:
            return authority_desc
        native_desc_filtered_list = [re.sub(u'[^\u4e00-\u9fa5\w\s\u0003\u0004]', u'\u001a', native_desc)
                                     for native_desc in native_desc_list]
        key_sent_list = []
        key_sent_dict = {}
        group_elem_dict = {}
        for i, native_desc in enumerate(native_desc_filtered_list):
            key_sent = self.extract_key_sent(native_desc, u'\u001a')
            if not key_sent:
                continue
            key_sent_list.append(key_sent)
            group_index = key_sent_dict.get(key_sent, len(key_sent_dict))
            key_sent_dict.setdefault(key_sent, group_index)
            group_elem_dict.setdefault(group_index, set())
            group_elem_dict[group_index].add(i)
        native_desc_filtered_list = [re.sub(u'\s+', '', native_desc.replace(u'\u001a', u''))
                                     for native_desc in native_desc_filtered_list]
        group_lcs_dict = {}
        for group_index in group_elem_dict:
            self.lcs.init(*[native_desc_filtered_list[i] for i in group_elem_dict[group_index]])
            lcs = self.lcs.gen_lcs()
            group_lcs_dict[group_index] = lcs
        Jaccard_index_extend_threshold = 0.8
        for i in xrange(len(group_lcs_dict)):
            for j in xrange(i + 1, len(group_lcs_dict)):
                if not group_lcs_dict[j]:
                    continue
                self.lcs.init(group_lcs_dict[i], group_lcs_dict[j])
                lcs = self.lcs.gen_lcs()
                Jaccard_index_extend = len(lcs) / float(min(len(group_lcs_dict[i]), len(group_lcs_dict[j])))
                if Jaccard_index_extend >= Jaccard_index_extend_threshold:
                    group_elem_dict[i] |= group_elem_dict[j]
                    group_elem_dict[j] = set()
        max_score = 0
        max_group_index = -1
        for group_index in group_elem_dict:
            def calc_group_score(elem_set):
                return sum(map(lambda index: len(native_desc_filtered_list) - index, elem_set))
            group_score = calc_group_score(group_elem_dict[group_index])
            if group_score > max_score:
                max_score = group_score
                max_group_index = group_index
        potential_group = sorted(group_elem_dict[max_group_index],
                                 key=lambda index: len(native_desc_filtered_list[index]))
        authority_desc = native_desc_list[potential_group[(len(potential_group) - 1) / 2]]
        self.lcs.init(*[native_desc_filtered_list[i] for i in group_elem_dict[max_group_index]])
        f = lambda x: math.floor((1 - (math.tanh(math.log(x / 3.0)) + 1) / 2.0 * 0.5) * x)
        self.lcs.set_cs_threshold(f(len(group_elem_dict[max_group_index])))
        max_group_lcs = self.lcs.gen_lcs()
        max_group_lcs_threshold = len(native_desc_filtered_list[potential_group[(len(potential_group) - 1) / 2]]) * 0.5
        if len(max_group_lcs) >= max_group_lcs_threshold:
            for native_desc in native_desc_list:
                native_desc_filtered = extract_uni_str(native_desc, u'[\u4e00-\u9fa5\w\u0003\u0004]')
                if native_desc_filtered.find(max_group_lcs) == -1:
                    continue
                start = self.fetch_native_desc_start(native_desc, max_group_lcs)
                end = self.fetch_native_desc_end(native_desc, max_group_lcs)
                authority_desc = native_desc[start: end + 1]
                break
        if authority_desc:
            if re.search(u'\S+', authority_desc[-1]):
                authority_desc += u'...'
            else:
                authority_desc += u'\u2026'
        return authority_desc

    def fetch_native_desc_start(self, native_desc, uni_str,
                                left_punc_dict=left_punc_dict,
                                right_punc_dict=right_punc_dict):
        """ """
        start = 0
        potential_offset_list = [m.start() for m in re.finditer(uni_str[0], native_desc)]
        for offset in potential_offset_list:
            native_desc_part = native_desc[offset:]
            native_desc_part_filtered = extract_uni_str(native_desc_part, u'[\u4e00-\u9fa5\w\u0003\u0004]+')
            if native_desc_part_filtered[: len(uni_str)] == uni_str:
                start = offset
                punc_stack = list()
                punc_stack.append(start)
                for i in xrange(start):
                    ch = native_desc[i]
                    if ch in left_punc_dict:
                        punc_stack.append(i)
                    if ch in right_punc_dict:
                        top = native_desc[punc_stack[-1]]
                        if right_punc_dict.get(ch) == left_punc_dict.get(top):
                            punc_stack.pop()
                start = punc_stack.pop()
                start -= 1
                while start >= 0:
                    if not extract_uni_str(native_desc[start], u'[\u4e00-\u9fa5\w\u0003\u0004]+'):
                        break
                    start -= 1
                start += 1
        return start

    def fetch_native_desc_end(self, native_desc, uni_str):
        """ """
        native_desc_reversed = native_desc[:: -1]
        uni_str_reversed = uni_str[:: -1]
        start = self.fetch_native_desc_start(native_desc_reversed, uni_str_reversed,
                                             right_punc_dict, left_punc_dict)
        end = len(native_desc) - 1 - start
        return end

    def extract_key_sent(self, uni_str, sep):
        """ """
        if not isinstance(uni_str, unicode):
            print 'uni_str is not an instance of unicode'
        key_sent = u''
        for sent in uni_str.split(sep):
            if len(sent) > len(key_sent):
                key_sent = sent
        return key_sent

    def fetch_cluster_info(self, book_name, rid):
        """ """
        cluster_info = []
        try:
            conn = MySQLdb.connect(host='10.46.7.171', port=4198, user='******',
                                   passwd='C9l3U4n6M2e1', db='novels_new')
            conn.set_character_set('GBK')
            conn.autocommit(True)
        except Exception as e:
            print 'fail to connect to the cluster db. error: {0}'.format(e)
            return cluster_info
        cluster_table_id = get_novel_cluster_table_id(book_name.decode('GBK', 'ignore'))
        query_sql = 'SELECT site_id, dir_id, dir_url, rank FROM novel_cluster_info{0} ' \
                    'WHERE cluster_id = {1}'.format(cluster_table_id, rid)
        cursor = conn.cursor()
        cursor.execute(query_sql)
        rows = cursor.fetchall()
        cluster_info = rows
        cursor.close()
        conn.close()
        return cluster_info

    def fetch_native_desc(self, site_id, dir_id):
        """ """
        native_desc = ''
        try:
            conn = MySQLdb.connect(host='10.46.7.172', port=4195, user='******',
                                   passwd='H4k3D8v9X2y5', db='novels')
        except Exception as e:
            print 'fail to connect to the db format. error: {0}'.format(e)
            return native_desc
        query_sql = 'SELECT raw_book_name, raw_pen_name, description ' \
                    'FROM dir_fmt_info{0} WHERE dir_id = {1}'.format(site_id, dir_id)
        cursor = conn.cursor()
        cursor.execute('SET NAMES GBK')
        cursor.execute('SET autocommit=1')
        cursor.execute(query_sql)
        row = cursor.fetchone()
        cursor.close()
        conn.close()
        return row

    def update_authority_info(self, authority_info_list):
        print 'start updating authority info. no: {0}'.format(len(authority_info_list))
        try:
            conn = MySQLdb.connect(host='10.46.7.171', port=4198, user='******',
                                   passwd='C9l3U4n6M2e1', db='novels_new')
        except Exception as e:
            print 'fail to connect to the cluster db. error: {0}'.format(e)
            return False
        update_sql = 'UPDATE novel_authority_info SET book_desc = %s WHERE rid = %s'
        cursor = conn.cursor()
        cursor.execute('SET NAMES GBK')
        cursor.execute('SET autocommit=1')
        for book_name, rid, book_desc in authority_info_list:
            cursor.execute(update_sql, (book_desc, rid))
        cursor.close()
        conn.close()
        print 'finish updating authority info'
        return True
import os
import time
import sys
import readSize      # assumed local helper for reading the size spec
import createString  # assumed local helper for generating random strings
import LCS

if __name__ == '__main__':
    size = readSize.readSize('input/size.txt')
    # random strings are generated each round, so these files are written
    # to save the inputs, not to read them
    inputA = open('input/inputA.txt', 'w')
    inputB = open('input/inputB.txt', 'w')
    timeTXT = open('output/time.txt', 'w')
    for key, value in size.items():
        for m, n in value:
            str1 = createString.createString(m)  # string of the given length
            str2 = createString.createString(n)
            if key == str('A'):
                # save the generated strings
                inputA.writelines(str1 + '\n')
                inputA.writelines(str2 + '\n')
            else:
                inputB.writelines(str1 + '\n')
                inputB.writelines(str2 + '\n')
            t1 = time.time()
            b, c = LCS.LCS(str1, str2)
            print('Group {} pair of size ({}, {}): LCS length is {}; one solution is: '.format(
                key, m, n, c[-1][-1]), end='')
            LCS.print_LCS(b, str1, m, n)
            print()
            t2 = time.time()
            # time.time() differences are in seconds
            timeTXT.writelines(str(t2 - t1) + ' s\n')
    tmp = div_english_korean(tmp)
    bodies = remove_tags(tmp)
    bodies = bodies.replace("=br=", "\n\n")
    bodies_split = bodies.split("!div_eng_kor!")
    if len(bodies_split) < 2:
        # x = x + 1
        filenumber = filenumber + 1
        continue
    div_english_sentence(bodies_split, filenumber, csv2)
    div_korean_sentence(bodies_split, filenumber, csv2)
    filenumber = filenumber + 1
csv2.close()

# get_html_csv()
print("save_content")
save_content(0, 8217)
no_single_space.no_single_space(0, 8217)
print("number_ko")
number_ko.number_ko(0, 8217)
print("number_en")
number_en.number_en(0, 8217)
print("LCS")
LCS.run(0, 8217)
        if op[:3] == 'ADD':
            with tag('div', klass="bg-success"):
                line('i', '', klass="fa fa-plus mr-3 pull-right")
                line('h3', op[3:], klass="text-white ml-5")
        elif op[:3] == 'REM':
            with tag('div', klass="bg-danger"):
                line('i', '', klass="fa fa-remove mr-3 pull-right")
                line('h3', op[3:], klass="text-white ml-5")
        elif op[:3] == 'LCS':
            with tag('div', klass="bg-primary"):
                line('i', '', klass="fa fa-star mr-3 pull-right")
                line('h3', op[3:], klass="text-white ml-5")
    doc._append('<h6>Developed by github/Subhodeep and Shourya</h6>')
    return indent(doc.getvalue())

def saveMarkedUpContentToFile(content):
    with open('markedup.html', 'w') as f:
        f.write(content)

if __name__ == "__main__":
    A = getFileContent('textA.txt')
    B = getFileContent('textB.txt')
    ops = LCS.LongestCommonSubsequence(A, B)
    print('operations to convert A->B', ops)
    _A = ''.join(A)
    _B = ''.join(B)
    saveMarkedUpContentToFile(generateMarkUp(ops, _A, _B))
    webbrowser.open("http://localhost:63342/File_Differ/markedup.html", autoraise=False)
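LCS.LongestCommonSubsequence is not shown; a minimal sketch, assuming it returns a list of strings tagged 'ADD'/'REM'/'LCS' (inferred from the op[:3] checks above) by backtracking the DP length table:

def LongestCommonSubsequence(A, B):
    # assumed implementation: DP table of LCS lengths over the two lists
    m, n = len(A), len(B)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m):
        for j in range(n):
            dp[i + 1][j + 1] = (dp[i][j] + 1 if A[i] == B[j]
                                else max(dp[i][j + 1], dp[i + 1][j]))
    # backtrack, emitting tagged operations (collected backwards, then reversed)
    ops, i, j = [], m, n
    while i > 0 or j > 0:
        if i > 0 and j > 0 and A[i - 1] == B[j - 1]:
            ops.append('LCS' + A[i - 1])
            i -= 1
            j -= 1
        elif j > 0 and (i == 0 or dp[i][j - 1] >= dp[i - 1][j]):
            ops.append('ADD' + B[j - 1])
            j -= 1
        else:
            ops.append('REM' + A[i - 1])
            i -= 1
    return ops[::-1]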
try:
    fileIO.init_outp(in_path, out_path)
except IOError:
    print(f"Output path {out_path} could not be reached. Please make sure "
          "that the directory exists.")
    quit()

# file iterators for loop
gen1 = fileIO.gen_sequences(in_path)
gen2 = fileIO.gen_sequences(in_path)

# loop through sequence combinations
for (lbl1, seq1), (lbl2, seq2) in product(gen1, gen2):
    # ignore same sequence
    if lbl1 == lbl2:
        continue
    # build the LCS matrix
    b_matrix, c_matrix = LCS.calc_lcs(seq1, seq2)
    # rebuild the LCS
    lcs_seq = LCS.build_seq(b_matrix, seq1, seq2)
    # command-line report
    print(f"LCS of {lbl1} and {lbl2} is {lcs_seq}.")
    # write LCS to output
    fileIO.write_outp(lbl1, lbl2, lcs_seq, out_path)
# keeps output cleaner
print("\n")