Code example #1
def solve(S):
    # the LCS of S and its reverse is the longest palindromic subsequence of S
    _S = list(reversed(S))
    row, col = len(S) + 1, len(_S) + 1
    # C holds LCS lengths, B holds backtracking directions
    C = [[-1 for j in range(col)] for i in range(row)]
    B = [[None for j in range(col)] for i in range(row)]
    LCS.LCS_length(S, _S, B, C)
    LCS.rebuid_solution(S, _S, B, C, len(S), len(_S))
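The LCS module used above is external to the snippet. As a point of reference, here is a minimal sketch of the two helpers it calls, assuming the standard dynamic-programming formulation (only the names LCS_length and rebuid_solution come from the call sites; the table layout below is an assumption):

def LCS_length(X, Y, B, C):
    # hypothetical stand-in: C[i][j] = LCS length of X[:i] and Y[:j],
    # B[i][j] = the move taken, for backtracking
    for i in range(len(X) + 1):
        for j in range(len(Y) + 1):
            if i == 0 or j == 0:
                C[i][j] = 0
            elif X[i - 1] == Y[j - 1]:
                C[i][j] = C[i - 1][j - 1] + 1
                B[i][j] = 'diag'
            elif C[i - 1][j] >= C[i][j - 1]:
                C[i][j] = C[i - 1][j]
                B[i][j] = 'up'
            else:
                C[i][j] = C[i][j - 1]
                B[i][j] = 'left'

def rebuid_solution(X, Y, B, C, i, j):
    # walk B back from (i, j) and print one longest common subsequence
    out = []
    while i > 0 and j > 0:
        if B[i][j] == 'diag':
            out.append(X[i - 1])
            i, j = i - 1, j - 1
        elif B[i][j] == 'up':
            i -= 1
        else:
            j -= 1
    print(''.join(reversed(out)))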
Code example #2
File: LCSTest.py Project: mingxiao/codeeval
 def testLCS(self):
     s1 = "abc"   # unused in this test
     s2 = "dabc"  # unused in this test
     t1 = 'XMJYAUZ'
     t2 = 'MZJAWXU'
     t = LCS.lcs(t1, t2)
     print t
     self.assertEqual(t, "MJAU")
Code example #3
def dist(a, b):
    """
    Input: vector A, vector B
    Output: the LCS distance between the two vectors
    """
    a_new = copy.deepcopy(a)
    b_new = copy.deepcopy(b)
    return LCS.get_lcs_distance(a_new, b_new)
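LCS.get_lcs_distance is likewise external. A common LCS-based distance is len(a) + len(b) - 2 * LCS(a, b); the sketch below assumes that definition and is not necessarily the project's implementation:

def get_lcs_distance(a, b):
    # hypothetical stand-in; computes the LCS length with a rolling row
    m, n = len(a), len(b)
    prev = [0] * (n + 1)
    for i in range(1, m + 1):
        cur = [0] * (n + 1)
        for j in range(1, n + 1):
            if a[i - 1] == b[j - 1]:
                cur[j] = prev[j - 1] + 1
            else:
                cur[j] = max(prev[j], cur[j - 1])
        prev = cur
    # the more two sequences share, the smaller the distance
    return m + n - 2 * prev[n]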
Code example #4
File: testing.py Project: aleperno/tp2tda
	def __init__(self, pal1, pal2):
		self.base = pal1
		self.posbase = 0
		self.objective = pal2
		self.lcs = LCS.lcs(pal1, pal2)
		self.poslcs = 0
		self.mem = {}
		self.cost = 0
Code example #5
def dist(a, b):
    """
    Input: vector A, vector B
    Output: the LCS distance between the two vectors
    """
    a_new = copy.deepcopy(a)
    b_new = copy.deepcopy(b)
    if a_new == b_new:  # identical vectors have distance 0
        return 0
    return LCS.get_lcs_distance(a_new, b_new)
Code example #6
    def init(self, rid_list):

        self.rid_list = random.sample(rid_list, min(200, len(rid_list)))
        self.desc_filter = filter_desc.DescFilter()
        self.desc_filter.init('./pattern/book_name_pattern.json', './pattern/pen_name_pattern.json',
                              './pattern/desc_pattern.json')

        self.lcs = LCS()

        return True
Code example #7
File: testing2.py Project: aleperno/tp2tda
	def __init__(self, pal1, pal2):
		self.base = pal1
		self.posbase = 0
		self.objective = pal2
		self.lcs = LCS.lcs(pal1, pal2)  # O(len(pal1) * len(pal2))
		self.poslcs = 0
		self.mem = {}
		self.diff = len(pal2) - len(pal1)  # O(1)
		self.cost = 0
		self.term = self.diff < 0  # O(1)
		self.inser = self.diff if self.diff > 0 else 0  # O(1)
		self.borr = self.diff if self.diff < 0 else 0  # O(1)
Code example #8
def getLCSFromFiles(fileOne,
                    fileTwo,
                    ignoreLineDir=True,
                    ignoreComments=True,
                    ignoreFileSpecifics=False):
    """
    Process an LCS object given two files.

    See getStringsFromFiles for arg info.
    """
    seqOne, seqTwo = getStringsFromFiles(fileOne, fileTwo, ignoreLineDir,
                                         ignoreComments, ignoreFileSpecifics)
    retLCS = LCS.LCS(seqOne, seqTwo)
    return retLCS
Code example #9
def CalculateSilhouetteCoefficient(result):
    """
    Compute the silhouette coefficient of a clustering result.
    :param result: the clustering result (a list of clusters, each a list of samples)
    :return: the silhouette coefficient
    """
    S = 0
    num_global = 0
    if len(result) == 1:
        return -1  # a single cluster has no meaningful silhouette
    for i in range(len(result)):
        if len(result[i]) == 1:
            # a singleton cluster contributes a silhouette of 0
            S = S + 0
            num_global = num_global + 1
        else:
            for j in range(len(result[i])):
                a = 0
                b_list = []
                num_global = num_global + 1
                # a: mean LCS distance to the other members of the same cluster
                for k in range(len(result[i])):
                    if j == k:
                        continue
                    a = a + LCS.get_lcs_distance(result[i][j], result[i][k])
                a = a / (len(result[i]) - 1)
                # b: minimum LCS distance to any sample in another cluster
                for i_tmp in range(len(result)):
                    if i == i_tmp:
                        continue
                    for j_tmp in range(len(result[i_tmp])):
                        b_list.append(LCS.get_lcs_distance(result[i][j], result[i_tmp][j_tmp]))
                if len(b_list) == 0:
                    return 0
                b = min(b_list)
                if max(a, b) == 0:
                    break
                S = S + (b - a) / max(a, b)
    S = S / num_global
    return S
Code example #10
    def __calcEdgeWeights(self, content):

        # edgeWeights[i] is the edge weight from the new article to the
        # article with id number i
        edgeWeights = []

        c2 = content.lower()
        Arr2len = len(c2)
        c2 = nltk.word_tokenize(c2)
        tagged2 = nltk.pos_tag(c2)
        # drop function words (determiners, prepositions, wh-words, etc.)
        c2 = [
            w for w, t in tagged2
            if t != "DT" and t != "IN" and t != "WP" and t != "WRB" and
            t != "WP$" and t != "WDT" and t != "EX" and t != "TO" and t != "RP"
        ]
        c2 = c2[0:500]

        for i in range(len(self.contents)):
            c1 = self.contents[i].lower()
            Arr1len = len(c1)
            c1 = nltk.word_tokenize(c1)
            tagged1 = nltk.pos_tag(c1)
            c1 = [
                w for w, t in tagged1 if t != "DT" and t != "IN" and t != "WP"
                and t != "WRB" and t != "WP$" and t != "WDT" and t != "EX"
                and t != "TO" and t != "RP"
            ]
            c1 = c1[0:500]

            # LCS length of the two filtered token lists, normalized by the
            # mean raw text length
            weight = LCS.LCS(c1, c2)
            weight = weight / ((Arr1len + Arr2len) / 2)

            #weight = weight * weight

            edgeWeights.append(weight)

        edgeWeights.append(0)  # weight of the new article to itself
        return edgeWeights
Code example #11
# encoding=utf-8
import split_sentence
import select_keywords_sentence
import LCS
# import lcs_2

dataroot = "dataset/test"
seacher_word = "tencentMa1998"
question = "built"

OriginData = "%s/%s_origin.txt" % (dataroot, seacher_word)
SplitedData = "%s/%s_split.txt" % (dataroot, seacher_word)
FliteredData = "%s/%s_fliter.txt" % (dataroot, seacher_word)
ModeData = "%s/%s_mode.txt" % (dataroot, question)

# words a sentence must contain to pass the filter
# ("Tencent", "Ma Huateng", "1998")
KeyWords = ["腾讯", "马化腾", "1998"]
# words to generalize
genKeyWords = ["腾讯", "马化腾", "1998"]
# genKeyWords must be a subset of KeyWords

# question = "公司,组织成立时间"  (company/organization founding time)


split_sentence._split_sentence(OriginData, SplitedData)

# test selecting keyword sentences
select_keywords_sentence._find_keywords_sentence(SplitedData, FliteredData, KeyWords)
print LCS.lcs(FliteredData, ModeData, genKeyWords)
Code example #12
# set the iteration step for tp_end to 2:
tp_end = tp_start + 2

while index + 1 < len(CUT) and tp_end < len(TP_TIME):
    # initialize the list of match scores (identity) as empty:
    IDENTITY = []

    # binary-search SP_TIME for the times corresponding to the
    # sp_start & sp_end derived from the beats:
    T1 = SP_TIME[BC.binary_search(SP_TIME, float(sp_start))]
    T2 = SP_TIME[BC.binary_search(SP_TIME, float(sp_end))]

    T3 = TP_TIME[tp_start]
    while tp_end <= tp_start + len_tp * 3 and tp_end < len(TP_TIME):
        # step through every tp_start & tp_end interval from tp_start+2 to
        # tp_start+len_tp*3 with a step of 2 and compute its identity;
        # store the identity value in the first column of IDENTITY and the
        # corresponding tp_end in the second column for later lookup:
        IDENTITY.append([LCS.lcs(T1, T2, T3, TP_TIME[tp_end]), tp_end])
        tp_end += 2

    # guard against an empty list
    if IDENTITY != []:
        # convert IDENTITY to a numpy array:
        IDENTITY = np.array(IDENTITY)
        # index of the maximum value in the first column:
        MAX_ID = np.argmax(IDENTITY, axis=0)
        # save sp_start, sp_end, tp_start and the tp_end with the highest
        # identity into MAX_SEG:
        if (int(IDENTITY[MAX_ID[0]][1]) + 10) < len(TP_TIME):
            MAX_SEG.append([
                sp_start, sp_end, TP_TIME[tp_start],
                TP_TIME[int(IDENTITY[MAX_ID[0]][1]) + 10]
            ])
        else:
Code example #13
    new_list1 = []
    new_list2 = []
    for i in range(size1):
        new_list1.append(choice(ascii_uppercase))
    for j in range(size2):
        new_list2.append(choice(ascii_uppercase))
    return new_list1, new_list2


if __name__ == "__main__":
    # input length of sequences
    m = int(input("Set the length of the first sequence: "))
    n = int(input("Set the length of the second sequence: "))

    # generate two random character sequences
    new_lists = generate_List(10, m, n)
    print(new_lists[0])
    print(new_lists[1])

    # create an instance of a naiveMethod algorithm and run it to solve the lcs problem
    nm = nm.naiveMethod()
    nm_lcs = lcs.LCS(new_lists[0], new_lists[1], nm)
    nm_result, nm_time = nm_lcs.run()
    print("The result: " + str(nm_result) + "\nThe time: " + str(nm_time))

    # create an instance of a dynamicProgramming algorithm and run it to solve the lcs problem
    dp = dp.dynamicProgramming(m, n)
    dp_lcs = lcs.LCS(new_lists[0], new_lists[1], dp)
    dp_result, dp_time = dp_lcs.run()
    print("The result: " + str(dp_result) + "\nThe time: " + str(dp_time - 1))
Code example #14
File: Project.py Project: kadharamkaushik/Plagerism
for file in os.listdir('.'):
	if (os.path.isfile(file)) and (file.endswith(".txt") and file != 'message.txt'):
		files.append(file)
for each_file1 in files:
	li = [each_file1]
	li_lcs = [each_file1]
	for each_file2 in files:
		if each_file1 != each_file2:
			file1 = open(each_file1, 'r')
			file2 = open(each_file2, 'r')
			file1_string = file1.read()
			file2_string = file2.read()
			file1.close()
			file2.close()
			word_list1 = (words_count(file1_string))
			word_list2 = (words_count(file2_string))
			li.append(round((Euclidean_norm(word_list1, word_list2)), 2))
			li_lcs.append((LCS.longestSubstringFinder(file1_string, file2_string)))
		else:
			li.append('X')
			li_lcs.append('X')
	bag_of_words.append(li)
	longest_sequence_substring.append(li_lcs)
#print (longest_sequence_substring)
print ('Executed on : ' + str(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
print ("*********Bag of words********")
_print_matrix_(bag_of_words)
print ("*********Longest Sequence Substring********")
_print_matrix_(longest_sequence_substring)
print('---------------------------------------------------------------------------------')
sys.stdout = old_stdout

log_file.close()
Code example #15
import LCS
import en_Num
import ko_Num
import no_single_space
import JoongangDaily
import ngram
import lcslib
#count = no_single_space.no_single_space(0,6)
#print(count)

#lcslib.check_answer([],1,"word_fill",5,0.3)

#ngram.storeDictionary("final_dic.csv","dictionary.csv",266081)

#LCS2.run(0,1831,dic,5,5,3,9)
#LCS.run2(1,5,3,3)

# JoongangDaily.save_content(0,10)
# no_single_space.no_single_space(0,10)

#print("number_ko")

#ko_Num.ko_Num(0,10)
#print("number_en")

#en_Num.en_Num(0,10)
#print("LCS")
#8217
root = ngram.getRoot("../dictionary.csv")
LCS.run(1, 1, root, 5, 5, 3, 3)
Code example #16
def Similarity(a, b):
    # similarity from the LCS: the LCS length normalized by the
    # geometric mean of the two lengths
    k = mod13.find_lcs_len(a, b) / math.sqrt(len(a) * len(b))
    print("similarity: ", k)
    return k
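mod13.find_lcs_len is not shown; as a sketch, the LCS length can also be computed top-down with memoization (a hypothetical stand-in, suitable only for short inputs because of Python's recursion limit). Note that this normalization keeps the similarity in [0, 1], since LCS(a, b) <= min(len(a), len(b)) <= sqrt(len(a) * len(b)):

from functools import lru_cache

def find_lcs_len(a, b):
    # hypothetical stand-in for mod13.find_lcs_len
    @lru_cache(maxsize=None)
    def rec(i, j):
        if i == 0 or j == 0:
            return 0
        if a[i - 1] == b[j - 1]:
            return rec(i - 1, j - 1) + 1
        return max(rec(i - 1, j), rec(i, j - 1))
    return rec(len(a), len(b))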
Code example #17
            N += 1
        print(index - TP_INDEX, N - 1)
        Comp_End = float(TP_BEAT[index])
        if index - TP_INDEX != N - 1:
            # when SP and TP disagree on the number of beats in between,
            # trust the match result.
            # first delete the boundaries found by beat detection:
            for m in range(1, index - TP_INDEX):
                TP_BEAT.pop(TP_INDEX + 1)

            for n in range(1, N - 1):
                Comp_Start = float(TP_BEAT[TP_INDEX + n - 1])
                end = Comp_Start + 0.2
                MaxComp = 0
                MaxBoundary = Comp_Start
                while end < Comp_End:
                    Comp = LCS.lcs(SP_BEAT[SP_INDEX + n - 1],
                                   SP_BEAT[SP_INDEX + n], Comp_Start, end)
                    end += 0.2
                    MaxComp = max(MaxComp, Comp)
                    if MaxComp == Comp:
                        MaxBoundary = end
                TP_BEAT.insert(TP_INDEX + n, str(MaxBoundary))

                # then add the matched boundaries back in:
#            for n in range(1,N-1):
#                temp_index = BC.binary_search(MATCH1,float(SP_BEAT[SP_INDEX+n]))
#                TP_BEAT.insert(TP_INDEX+n,MATCH[temp_index][1])
            SP_INDEX += N - 1
            TP_INDEX += N - 1
            continue
        else:
            # when SP and TP agree on the beat count, compare their match
            # scores Comp and take the higher one as the new boundary:
Code example #18
idx = start_idx
while True:

    if idx > end_idx:
        break

    # only process every 10th file
    if idx % 10 != 0:
        idx += 1
        continue

    print(str(idx) + ".txt")
    kLink, eLink, percent = wiki.check_all_pair(dic, idx)

    print("check all pair end")
    #check = wiki.make_file_for_LCS(kLink,eLink,root,idx)
    #print("make file for lcs end")
    if percent == -1:
        i += 1  # error counter, presumably defined above this excerpt
        idx += 1
        print("makefileforLCS error")
        continue

    en_Num.en_Num(idx, idx)
    print("en num end")
    ko_Num.ko_Num(idx, idx)
    print("ko num end")
    LCS.run(idx, idx, root, 5, 5, 3, 3, kLink, eLink)
    idx += 1
#LCS2.run(0,1831,dic,5,5,3,9)
#LCS.run2(1,5,3,3)
Code example #19
import LCS
import en_Num
import ko_Num
import div_eng
import div_kor
import herald_word_text
import ngram
import lcslib
#count = herald_word_text.herald_word_text(0,6)
#print(count)
#div_eng.div_eng(0,1800)
#div_kor.div_kor(0,1800)
#en_Num.en_Num(0,1798)
#ko_Num.ko_Num(0,1798)

#lcslib.check_answer([],1,"word_fill",5,0.3)

#ngram.storeDictionary("../../final_dic.csv","../../dictionary.csv",266081)
root = ngram.getRoot("../dictionary.csv")
LCS.run(1, 1798, root, 5, 5, 3, 3)
#LCS2.run(0,1831,dic,5,5,3,9)
#LCS.run2(1,5,3,3)
Code example #20
#!/usr/bin/python
# Filename: Longest-Common-Subsequence.py

import LCSLength
import LCS

# sequences are 1-indexed in CLRS style; index 0 holds a sentinel
arrayX = [0, 'a', 'b', 'c', 'b', 'd', 'a', 'b']
arrayY = [0, 'b', 'd', 'c', 'a', 'b', 'a']

m = len(arrayX) - 1
n = len(arrayY) - 1

# c[i][j] holds LCS lengths, b[i][j] holds backtracking directions
c = [[0] * (n + 1) for i in range(m + 1)]
b = [[0] * (n + 1) for i in range(m + 1)]

LCSLength.LCSLength(m, n, arrayX, arrayY, c, b)
LCS.LCS(m, n, arrayX, b)
Code example #21
 def test_ExpectedOutput(self):
     '''
     Checks if returned output is as expected.
     '''
     output = LCS.LCS(["abcabb", "bacb"])
     self.assertEqual(output, 3)
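The test above passes a two-element list and expects the LCS length back (3 for "abcabb" vs "bacb", e.g. "acb"). A minimal sketch of an LCS function with that call shape, assumed from the test rather than taken from the project:

def LCS(seqs):
    # hypothetical: takes a list of two sequences, returns the LCS length
    x, y = seqs
    table = [[0] * (len(y) + 1) for _ in range(len(x) + 1)]
    for i in range(1, len(x) + 1):
        for j in range(1, len(y) + 1):
            if x[i - 1] == y[j - 1]:
                table[i][j] = table[i - 1][j - 1] + 1
            else:
                table[i][j] = max(table[i - 1][j], table[i][j - 1])
    return table[-1][-1]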
Code example #22
    def mapping(self):
        """Main mapping function."""
        # initial beat mapping:
        for item in self.sp_beat:
            sp_index = self.binary_search(self.old_match_sp, item)
            self.tp_beat_old.append(self.old_match[sp_index][1])

        for item in self.sp_beat:
            sp_index = self.binary_search(self.new_match_sp, item)
            self.tp_beat_new.append(self.new_match[sp_index][1])

        # check whether the two matches map each beat to the same result; mark 1 where they agree, else 0:
        for i in range(len(self.tp_beat_old)):
            mean = 0.0
            if i == 0:
                self.tp_beat.append(float(self.tp_beat_new[0]))
                self.mark.append(1)
            else:
                if abs(
                        float(self.tp_beat_new[i]) -
                        float(self.tp_beat_old[i])) <= 0.1:
                    self.tp_beat.append(float(self.tp_beat_new[i]))
                    self.mark.append(1)
                else:
                    # score the similarity of the current beat unit with the LCS algorithm:
                    sp_start = self.sp_beat[i - 1]
                    sp_end = self.sp_beat[i]
                    tp_start = self.tp_beat[i - 1]
                    tp_new_end = self.tp_beat_new[i]
                    tp_old_end = self.tp_beat_old[i]
                    identity_old1 = LCS.lcs(
                        sp_start, sp_end, tp_start, tp_old_end,
                        self.save_directory + "/sp_note.csv",
                        self.save_directory + "/tp_note.csv")
                    identity_new1 = LCS.lcs(
                        sp_start, sp_end, tp_start, tp_new_end,
                        self.save_directory + "/sp_note.csv",
                        self.save_directory + "/tp_note.csv")
                    # compute the gap of the current beat unit, compare it with the
                    # average gap of the previous 10 beat units, and derive an identity:
                    if i <= 10:
                        if identity_new1 >= identity_old1:
                            self.tp_beat.append(self.tp_beat_new[i])
                            self.mark.append(1)
                        else:
                            self.tp_beat.append(self.tp_beat_old[i])
                            self.mark.append(1)
                    else:
                        mean = (float(self.tp_beat[i - 1]) -
                                float(self.tp_beat[i - 11])) / 10
                        gap_old = float(self.tp_beat_old[i]) - float(
                            self.tp_beat[i - 1])
                        gap_new = float(self.tp_beat_new[i]) - float(
                            self.tp_beat[i - 1])
                        identity_old2 = (1 -
                                         abs(mean - gap_old) / mean) * 100 / 3
                        identity_new2 = (1 -
                                         abs(mean - gap_new) / mean) * 100 / 3
                        identity_old = identity_old1 + identity_old2
                        identity_new = identity_new1 + identity_new2
                        # print(i, identity_new1, identity_new2, identity_old1, identity_old2)
                        # take the beat time with the higher identity as the final result:
                        if identity_new >= identity_old:
                            self.tp_beat.append(self.tp_beat_new[i])
                            if identity_new > 28.0:
                                self.mark.append(1)
                            else:
                                self.mark.append(0)
                        else:
                            self.tp_beat.append(self.tp_beat_old[i])
                            if identity_old > 28.0:
                                self.mark.append(1)
                            else:
                                self.mark.append(0)
        return self.tp_beat
Code example #23
class AuthorityDesc(object):

    def __init__(self):

        pass

    def init(self, rid_list):

        self.rid_list = random.sample(rid_list, min(200, len(rid_list)))
        self.desc_filter = filter_desc.DescFilter()
        self.desc_filter.init('./pattern/book_name_pattern.json', './pattern/pen_name_pattern.json',
                              './pattern/desc_pattern.json')

        self.lcs = LCS()

        return True

    def __del__(self):

        pass

    def exit(self):

        return True

    def run(self):

        self.process()

        return True

    def process(self):

        print 'start processing'

        authority_info_list = self.fetch_authority_info()

        authority_info_len = len(authority_info_list)
        authority_info_index = 0

        for rid, book_name, pen_name, book_desc in authority_info_list:

            print 'index: {0}. tot: {1}'.format(authority_info_index, authority_info_len)
            authority_info_index += 1

            book_desc_update = self.check_authority_desc(book_name, pen_name, rid)

        print 'finish processing'

        return True

    def fetch_authority_info(self):

        print 'start fetching authority info'

        authority_info_list = []

        try:
            conn = MySQLdb.connect(host='10.46.7.171', port=4198, user='******', passwd='C9l3U4n6M2e1',
                                   db='novels_new')
            conn.set_character_set('GBK')
            conn.autocommit(True)
        except Exception as e:
            print 'fail to connect to the cluster db. error: {0}'.format(e)
            return authority_info_list

        cursor = conn.cursor()

        query_sql = 'SELECT rid, book_name, pen_name, description FROM novel_authority_info ' \
                    'WHERE rid = %s AND rank > 0'
        for rid in self.rid_list:

            cursor.execute(query_sql, (rid,))
            row = cursor.fetchone()

            if not row:
                continue

            authority_info_list.append(row)

        cursor.close()
        conn.close()

        print 'finish fetching authority info. no: {0}'.format(len(authority_info_list))

        return authority_info_list

    def check_authority_desc(self, book_name, pen_name, rid):
        """
        """

        authority_desc = ''
        cluster_info = self.fetch_cluster_info(book_name, rid)
        cluster_info = sorted(cluster_info, key=lambda item: item[-1], reverse=True)

        if not cluster_info:
            return authority_desc

        native_desc_list = []
        for site_id, dir_id, dir_url, rank in cluster_info:

            row = self.fetch_native_desc(site_id, dir_id)
            if not row:
                continue

            raw_book_name, raw_pen_name, raw_desc = self.fetch_native_desc(site_id, dir_id)

            native_desc = self.desc_filter.filter_desc(site_id, *map(lambda uni_str: unicode(uni_str, 'GBK', 'ignore'),
                                                                     [raw_book_name, raw_pen_name, raw_desc]))
            if not self.is_valid_desc(native_desc):
                continue

            print '-' * 20
            print 'site_id: {0}'.format(site_id)
            print 'dir_id: {0}'.format(dir_id)
            print 'dir_url: {0}'.format(dir_url)
            #print 'raw_desc: {0}'.format(raw_desc)
            print 'native_desc: {0}'.format(native_desc.encode('GBK', 'ignore'))

            native_desc_list.append(native_desc)

        authority_desc = self.authority_desc_strategy(native_desc_list)
        authority_desc = authority_desc.replace(u'\u0003', unicode(book_name, 'GBK'))
        authority_desc = authority_desc.replace(u'\u0004', unicode(pen_name, 'GBK'))
        authority_desc = authority_desc.encode('GBK', 'ignore')

        dir_url = cluster_info[0][2]
        print '*' * 20
        print '\t'.join(map(str, [rid, book_name, pen_name, dir_url, authority_desc]))

        return authority_desc

    def is_valid_desc(self, desc):
        """
        """

        valid_desc_len_threshold = 5
        desc_filtered = re.sub(u'[^\u4e00-\u9fa5\w\u0003\u0004]', '', desc)

        return len(desc_filtered) >= valid_desc_len_threshold

    def authority_desc_strategy(self, native_desc_list):
        """
        """

        authority_desc = ''

        if not native_desc_list:
            return authority_desc

        native_desc_filtered_list = [re.sub(u'[^\u4e00-\u9fa5\w\s\u0003\u0004]', u'\u001a', native_desc)
                                     for native_desc in native_desc_list]

        key_sent_list = []
        key_sent_dict = {}
        group_elem_dict = {}
        for i, native_desc in enumerate(native_desc_filtered_list):

            key_sent = self.extract_key_sent(native_desc, u'\u001a')
            if not key_sent:
                continue

            key_sent_list.append(key_sent)

            group_index = key_sent_dict.get(key_sent, len(key_sent_dict))
            key_sent_dict.setdefault(key_sent, group_index)
            group_elem_dict.setdefault(group_index, set())
            group_elem_dict[group_index].add(i)

        native_desc_filtered_list = [re.sub(u'\s+', '', native_desc.replace(u'\u001a', u''))
                                     for native_desc in native_desc_filtered_list]
        group_lcs_dict = {}
        for group_index in group_elem_dict:

            self.lcs.init(*[native_desc_filtered_list[i] for i in group_elem_dict[group_index]])
            lcs = self.lcs.gen_lcs()
            group_lcs_dict[group_index] = lcs

        Jaccard_index_extend_threshold = 0.8
        for i in xrange(len(group_lcs_dict)):
            for j in xrange(i + 1, len(group_lcs_dict)):
                if not group_lcs_dict[j]:
                    continue

                self.lcs.init(group_lcs_dict[i], group_lcs_dict[j])
                lcs = self.lcs.gen_lcs()

                Jaccard_index_extend = len(lcs) / float(min(len(group_lcs_dict[i]), len(group_lcs_dict[j])))
                if Jaccard_index_extend >= Jaccard_index_extend_threshold:
                    group_elem_dict[i] |= group_elem_dict[j]
                    group_elem_dict[j] = set()

        max_score = 0
        max_group_index = -1
        for group_index in group_elem_dict:

            def calc_group_score(elem_set):
                return sum(map(lambda index: len(native_desc_filtered_list) - index, elem_set))

            group_score = calc_group_score(group_elem_dict[group_index])
            if group_score > max_score:
                max_score = group_score
                max_group_index = group_index

        potential_group= sorted(group_elem_dict[max_group_index], key=lambda index: len(native_desc_filtered_list[index]))
        authority_desc = native_desc_list[potential_group[(len(potential_group) - 1) / 2]]

        self.lcs.init(*[native_desc_filtered_list[i] for i in group_elem_dict[max_group_index]])
        f = lambda x: math.floor((1 - (math.tanh(math.log(x / 3.0)) + 1) / 2.0 * 0.5) * x)
        self.lcs.set_cs_threshold(f(len(group_elem_dict[max_group_index])))
        max_group_lcs = self.lcs.gen_lcs()

        max_group_lcs_threshold = len(native_desc_filtered_list[potential_group[(len(potential_group) - 1) / 2]]) * 0.5
        if len(max_group_lcs) >= max_group_lcs_threshold:
            for native_desc in native_desc_list:
                native_desc_filtered = extract_uni_str(native_desc, u'[\u4e00-\u9fa5\w\u0003\u0004]')
                if native_desc_filtered.find(max_group_lcs) == -1:
                    continue

                start = self.fetch_native_desc_start(native_desc, max_group_lcs)
                end = self.fetch_native_desc_end(native_desc, max_group_lcs)

                authority_desc = native_desc[start: end + 1]
                break

        if authority_desc:
            # append an ASCII ellipsis when the last char is non-whitespace
            if re.search(u'\S', authority_desc[-1]):
                authority_desc += u'...'
            else:
                authority_desc += u'\u2026'

        return authority_desc

    def fetch_native_desc_start(self, native_desc, uni_str, left_punc_dict=left_punc_dict, right_punc_dict=right_punc_dict):
        """
        """

        start = 0

        potential_offset_list = [m.start() for m in re.finditer(uni_str[0], native_desc)]
        for offset in potential_offset_list:

            native_desc_part = native_desc[offset:]
            native_desc_part_filtered = extract_uni_str(native_desc_part, u'[\u4e00-\u9fa5\w\u0003\u0004]+')
            if native_desc_part_filtered[: len(uni_str)] == uni_str:
                start = offset
        
        punc_stack = list()
        punc_stack.append(start)
        for i in xrange(start):

            ch = native_desc[i]

            if ch in left_punc_dict:
                punc_stack.append(i)

            if ch in right_punc_dict:
                top = native_desc[punc_stack[-1]]
                if right_punc_dict.get(ch) == left_punc_dict.get(top):
                    punc_stack.pop()

        start = punc_stack.pop()
        start -= 1
        while start >= 0:

            if not extract_uni_str(native_desc[start], u'[\u4e00-\u9fa5\w\u0003\u0004]+'):
                break
            
            start -= 1

        start += 1
        return start

    def fetch_native_desc_end(self, native_desc, uni_str):
        """
        """

        native_desc_reversed = native_desc[:: -1]
        uni_str_reversed = uni_str[:: -1]

        start = self.fetch_native_desc_start(native_desc_reversed, uni_str_reversed, right_punc_dict, left_punc_dict)

        end = len(native_desc) - 1 - start

        return end

    def extract_key_sent(self, uni_str, sep):
        """
        """

        if not isinstance(uni_str, unicode):
            print 'uni_str is not an instance of unicode'

        key_sent = u''
        for sent in uni_str.split(sep):
            if len(sent) > len(key_sent):
                key_sent = sent

        return key_sent

    def fetch_cluster_info(self, book_name, rid):
        """
        """

        cluster_info = []

        try:
            conn = MySQLdb.connect(host='10.46.7.171', port=4198, user='******', passwd='C9l3U4n6M2e1', db='novels_new')
            conn.set_character_set('GBK')
            conn.autocommit(True)
        except Exception as e:
            print 'fail to connect to the cluster db. error: {0}'.format(e)
            return cluster_info

        cluster_table_id = get_novel_cluster_table_id(book_name.decode('GBK', 'ignore'))

        query_sql = 'SELECT site_id, dir_id, dir_url, rank FROM novel_cluster_info{0} WHERE cluster_id = {1}' \
                    ''.format(cluster_table_id, rid)

        cursor = conn.cursor()

        cursor.execute(query_sql)
        rows = cursor.fetchall()
        cluster_info = rows

        cursor.close()
        conn.close()

        return cluster_info

    def fetch_native_desc(self, site_id, dir_id):
        """
        """

        native_desc = ''

        try:
            conn = MySQLdb.connect(host='10.46.7.172', port=4195, user='******', passwd='H4k3D8v9X2y5', db='novels')
        except Exception as e:
            print 'fail to connect to the db format. error: {0}'.format(e)
            return native_desc

        query_sql = 'SELECT raw_book_name, raw_pen_name, description FROM dir_fmt_info{0} WHERE dir_id = {1}'.format(site_id, dir_id)

        cursor = conn.cursor()
        cursor.execute('SET NAMES GBK')
        cursor.execute('SET autocommit=1')

        cursor.execute(query_sql)
        row = cursor.fetchone()

        cursor.close()
        conn.close()

        return row

    def update_authority_info(self, authority_info_list):

        print 'start updating authority info. no: {0}'.format(len(authority_info_list))

        try:
            conn = MySQLdb.connect(host='10.46.7.171', port=4198, user='******', passwd='C9l3U4n6M2e1', db='novels_new')
        except Exception as e:
            print 'fail to connect to the cluster db. error: {0}'.format(e)
            return False

        update_sql = 'UPDATE novel_authority_info SET book_desc = %s WHERE rid = %s'

        cursor = conn.cursor()
        cursor.execute('SET NAMES GBK')
        cursor.execute('SET autocommit=1')

        for book_name, rid, book_desc in authority_info_list:

            cursor.execute(update_sql, (book_desc, rid))

        cursor.close()
        conn.close()

        print 'finish updating authority info'

        return True
Code example #24
File: main.py Project: gpzlx1/algorithm
import os
import time
import sys
import readSize
import createString
import LCS

if __name__ == '__main__':
    size = readSize.readSize('input/size.txt')
    # random strings are generated every round, so the files here are
    # written to save them, not read
    inputA = open('input/inputA.txt', 'w')
    inputB = open('input/inputB.txt', 'w')
    timeTXT = open('output/time.txt', 'w')
    for key, value in size.items():
        for m, n in value:
            str1 = createString.createString(m)  # string of the given length
            str2 = createString.createString(n)
            if (key == str('A')):  # save the generated strings
                inputA.writelines(str1 + '\n')
                inputA.writelines(str2 + '\n')
            else:
                inputB.writelines(str1 + '\n')
                inputB.writelines(str2 + '\n')
            t1 = time.time()
            b, c = LCS.LCS(str1, str2)
            print('Group {}, size ({}, {}): LCS length is {}, one solution is: '.format(
                key, m, n, c[-1][-1]),
                  end='')
            LCS.print_LCS(b, str1, m, n)
            print()
            t2 = time.time()
            timeTXT.writelines(str(t2 - t1) + ' s\n')  # time.time() deltas are seconds
Code example #25
        tmp = div_english_korean(tmp)
        bodies = remove_tags(tmp)

        bodies = bodies.replace("=br=", "\n\n")

        bodies_split = bodies.split("!div_eng_kor!")

        if len(bodies_split) < 2:
            #x=x+1
            filenumber = filenumber + 1
            continue

        div_english_sentence(bodies_split, filenumber, csv2)
        div_korean_sentence(bodies_split, filenumber, csv2)
        filenumber = filenumber + 1

    csv2.close()


#get_html_csv()
print("save_content")
save_content(0, 8217)
no_single_space.no_single_space(0, 8217)

print("number_ko")
number_ko.number_ko(0, 8217)
print("number_en")
number_en.number_en(0, 8217)
print("LCS")
LCS.run(0, 8217)
Code example #26
                        with tag('div', klass="bg-success"):
                            line('i', '', klass="fa fa-plus mr-3 pull-right")
                            line('h3', op[3:], klass="text-white ml-5")
                    elif op[:3] == 'REM':
                        with tag('div', klass="bg-danger"):
                            line('i', '', klass="fa fa-remove mr-3 pull-right")
                            line('h3', op[3:], klass="text-white ml-5")
                    elif op[:3] == 'LCS':
                        with tag('div', klass="bg-primary"):
                            line('i', '', klass="fa fa-star mr-3 pull-right")
                            line('h3', op[3:], klass="text-white ml-5")
        doc._append('<h6>Developed by github/Subhodeep and Shourya</h6>')
    return indent(doc.getvalue())


def saveMarkedUpContentToFile(content):
    with open('markedup.html', 'w') as f:
        f.write(content)


if __name__ == "__main__":
    A = getFileContent('textA.txt')
    B = getFileContent('textB.txt')
    ops = LCS.LongestCommonSubsequence(A, B)
    print('operations to convert A->B', ops)
    _A = ''.join(A)
    _B = ''.join(B)
    saveMarkedUpContentToFile(generateMarkUp(ops, _A, _B))
    webbrowser.open("http://localhost:63342/File_Differ/markedup.html",
                    autoraise=False)
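LCS.LongestCommonSubsequence evidently returns operations tagged 'ADD', 'REM' or 'LCS' (the markup code above branches on op[:3]). Here is a sketch of how such a diff can be derived by backtracking through the LCS table; this is an assumed reconstruction, not the project's code:

def LongestCommonSubsequence(a, b):
    # hypothetical diff generator: common elements come back tagged 'LCS',
    # elements only in a as 'REM', elements only in b as 'ADD'
    m, n = len(a), len(b)
    c = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if a[i - 1] == b[j - 1]:
                c[i][j] = c[i - 1][j - 1] + 1
            else:
                c[i][j] = max(c[i - 1][j], c[i][j - 1])
    ops = []
    i, j = m, n
    while i > 0 or j > 0:
        if i > 0 and j > 0 and a[i - 1] == b[j - 1]:
            ops.append('LCS' + a[i - 1])
            i, j = i - 1, j - 1
        elif j > 0 and (i == 0 or c[i][j - 1] >= c[i - 1][j]):
            ops.append('ADD' + b[j - 1])
            j -= 1
        else:
            ops.append('REM' + a[i - 1])
            i -= 1
    return ops[::-1]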
Code example #27
File: project3.py Project: WillMc93/AS.605.620
    try:
        fileIO.init_outp(in_path, out_path)
    except IOError:
        print(f"Output path {out_path} could not be reached. Please make sure "
              "that the directory exists.")
        quit()

    # file iterators for loop
    gen1 = fileIO.gen_sequences(in_path)
    gen2 = fileIO.gen_sequences(in_path)

    # loop through sequence combinations
    for (lbl1, seq1), (lbl2, seq2) in product(gen1, gen2):
        # ignore same sequence
        if lbl1 == lbl2:
            continue

        # build the LCS matrix
        b_matrix, c_matrix = LCS.calc_lcs(seq1, seq2)

        # rebuild the LCS
        lcs_seq = LCS.build_seq(b_matrix, seq1, seq2)

        # command-line report
        print(f"LCS of {lbl1} and {lbl2} is {lcs_seq}.")

        # write LCS to output
        fileIO.write_outp(lbl1, lbl2, lcs_seq, out_path)

    # keeps output cleaner
    print("\n")