Example No. 1
def findsimilarity(note):
    matchbooks = []
    h = codecs.open(note, "r")
    text = h.read()
    tags = jieba.analyse.extract_tags(text, topK=30)
    loadbookbase()

    tags = stopwordsfilter.stopwordsfilter(tags)

    # start matching
    for bookline in bookbase:
        # bookline is the tag list for one book in the book base
        ismatch = 0
        matchtags = ""
        for tag in tags:
            # tag is one of the tags for testnote
            for mtag in bookline:
                # mtag is one of the tags in one file of the tagbase
                if tag == mtag:
                    # if a book tag matches one of the test note's tags, record the match
                    ismatch = 1
                    matchtags = matchtags + tag + " "
                    break
        if ismatch == 1:
            matchbooks.append(bookline[0] + " " + matchtags)

    print "Original Notes"
    print note
    loglines.append("Original Notes: " + note + "\n")
    print "Related Books:"
    loglines.append("Related Books:" + "\n")
    for book in matchbooks:
        loglines.append(book + "\n")
        print book
    loglines.append("-------------------")
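
The routine above depends on module-level state (bookbase, loglines) and helpers (loadbookbase, stopwordsfilter) defined elsewhere in the project. A minimal driver might look like the following sketch; run_similarity, the note path and the log file name are illustrative assumptions, not part of the original code.

# Hypothetical driver (not from the original project): assumes the module
# above already defines findsimilarity(), bookbase and loglines.
import codecs

def run_similarity(notepath, logpath='./similarity.log'):
    # findsimilarity() prints its results and appends them to loglines
    findsimilarity(notepath)
    out = codecs.open(logpath, 'w', 'utf-8')
    for line in loglines:
        out.write(line)
    out.close()
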
Example No. 2
def findsimilarity(note):
    matchbooks = []
    h = codecs.open(note, 'r')
    text = h.read()
    tags = jieba.analyse.extract_tags(text, topK=30)
    loadbookbase()

    tags = stopwordsfilter.stopwordsfilter(tags)

    # start matching
    for bookline in bookbase:
        # bookline is the tag list for one book in the book base
        ismatch = 0
        matchtags = ''
        for tag in tags:
            # tag is one of the tags for testnote
            for mtag in bookline:
                # mtag is one of the tags in one file of the tagbase
                if tag == mtag:
                    # if a book tag matches one of the test note's tags, record the match
                    ismatch = 1
                    matchtags = matchtags + tag + " "
                    break
        if ismatch == 1:
            matchbooks.append(bookline[0] + " " + matchtags)

    print 'Original Notes'
    print note
    loglines.append('Original Notes: ' + note + '\n')
    print 'Related Books:'
    loglines.append('Related Books:' + '\n')
    for book in matchbooks:
        loglines.append(book + '\n')
        print book
    loglines.append('-------------------')
Example No. 3
def findsimilarity(note):
    matchnotes = []
    h = codecs.open(note, 'r')
    text = h.read()
    tags = jieba.analyse.extract_tags(text, topK=30)
    loadtagbase()
    ismatch = 0

    tags = stopwordsfilter.stopwordsfilter(tags)

    # start matching
    for tagline in tagbase:
        # tagline is the tag list for one note in the tag base
        ismatch = 0
        matchtags = ''
        for tag in tags:
            # tag is one of the tags for testnote
            for mtag in tagline:
                # mtag is one of the tags in one file of the tagbase
                if tag == mtag:
                    # if a tag from the stored note matches one of the test note's tags, record the match
                    ismatch = 1
                    matchtags = matchtags + tag + " "
                    break
        if ismatch == 1:
            matchnotes.append(tagline[0] + " " + matchtags)

    print 'Original Notes'
    print note
    loglines.append('Original Notes: ' + note + '\n')
    print 'Similar Notes:'
    loglines.append('Similar Notes:' + '\n')
    for nt in matchnotes:
        loglines.append(nt + '\n')
        print nt
    loglines.append('-------------------')
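
The nested loops above amount to a per-tag membership test. An equivalent variant using set intersection is sketched below; it is not the original code, and it assumes each row of the tag base starts with the note's filename followed by its tags, as in the example.

# Sketch: set-based variant of the matching loop (same matched notes; tag order may differ).
def match_by_sets(tags, tagbase):
    # tags: keywords of the query note, already stop-word filtered
    # tagbase: list of [filename, tag1, tag2, ...] rows
    matchnotes = []
    tagset = set(tags)
    for tagline in tagbase:
        overlap = tagset.intersection(tagline[1:])
        if overlap:
            matchnotes.append(tagline[0] + " " + " ".join(sorted(overlap)))
    return matchnotes
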
Example No. 4
def noteproc():
    files = os.listdir(noteroot)
    taglist = []
    for f in files:
        if f[0] == '.': continue
        if os.path.isdir(noteroot + '/' + f): continue
        # load note
        h = codecs.open(noteroot + '/' + f, 'r')
        text = h.read()
        # clean timestamps in notes -> should be factored into a helper function
        # TODO eliminate ![]() tag
        #text = re.sub(r'\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d','', text)
        #text = re.sub(r'\d\d\d\d-\d\d-\d\d','', text)
        # clean markdown syntax
        text = re.sub(r'!\[.*\]\(.*\)', '', text)
        text = re.sub(r'\(http.*\)', '', text)
        text = re.sub(r'#+', '', text)
        # get top 30 tags (without weights)
        tags = jieba.analyse.extract_tags(text, topK=30, withWeight=False)
        # store to a list
        output = f
        print f,

        tags = stopwordsfilter.stopwordsfilter(tags)

        for t in tags:
            print ' ' + t,
            output = output + u' ' + t
        print ''
        taglist.append(output + '\n')

    fh = codecs.open('./seg/tags.txt', 'w', 'utf-8')
    for t in taglist:
        fh.write(t)

    print 'done'
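
To show what the three re.sub calls actually strip before keyword extraction, here is a small self-contained illustration; the sample markdown text is invented for this sketch.

# Illustration only: effect of the markdown-cleaning substitutions used above.
import re

sample = u'## Notes\n![diagram](img/arch.png)\nSee [docs](http://example.com/docs) for details.'
sample = re.sub(r'!\[.*\]\(.*\)', '', sample)   # remove image embeds
sample = re.sub(r'\(http.*\)', '', sample)      # remove inline URLs
sample = re.sub(r'#+', '', sample)              # remove heading markers
print(sample)
# Output:
#  Notes
#
# See [docs] for details.
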
Example No. 5
def noteproc():
    files = os.listdir(noteroot)
    taglist = []
    for f in files:
        if f[0] == '.': continue
        if os.path.isdir(noteroot + '/' + f): continue
        # load note
        h = codecs.open(noteroot + '/' + f, 'r')
        text = h.read()
        # clean timestamps in notes -> should be factored into a helper function
        # TODO eliminate ![]() tag
        #text = re.sub(r'\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d','', text)
        #text = re.sub(r'\d\d\d\d-\d\d-\d\d','', text)
        # clean markdown syntax
        text = re.sub(r'!\[.*\]\(.*\)', '', text)
        text = re.sub(r'\(http.*\)', '', text)
        text = re.sub(r'#+', '', text)
        # get top 30 tags (without weights)
        tags = jieba.analyse.extract_tags(text, topK=30, withWeight=False)
        # store to a list
        output = f
        print f,

        tags = stopwordsfilter.stopwordsfilter(tags)

        for t in tags:
            print ' ' + t,
            output = output + u' ' + t
        print ''
        taglist.append(output + '\n')

    fh = codecs.open('./seg/tags.txt', 'w', 'utf-8')
    for t in taglist:
        fh.write(t)

    print 'done'
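
noteproc() writes one whitespace-separated line per note (filename followed by its tags) into ./seg/tags.txt. A loader that turns that file back into the list-of-token-lists shape iterated over by findsimilarity might look like the sketch below; the function name and the utf-8 assumption are mine, not taken from the original source.

# Sketch: read ./seg/tags.txt back into a list of [filename, tag1, tag2, ...] rows.
import codecs

def load_tagbase(path='./seg/tags.txt'):
    base = []
    fh = codecs.open(path, 'r', 'utf-8')
    for line in fh:
        tokens = line.strip().split()
        if tokens:
            base.append(tokens)
    fh.close()
    return base
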