def main():
    """Run the start-up database checks, then launch the Tk application.

    A short-lived connection is used for ``check_db`` and
    ``check_watched_files`` and is closed before the GUI starts. The
    application itself receives a second, fresh connection from
    ``database.get_connect()``. This function blocks in ``mainloop()``
    until the Tk main loop exits.
    """
    connection = database.get_connect()
    try:
        check_db(connection)
        check_watched_files(connection)
    finally:
        # Release the start-up connection even when one of the checks fails.
        connection.close()

    master = tk.Tk()

    # Enlarge the default Tk font and make it the default for all widgets.
    default_font = tkfont.nametofont("TkDefaultFont")
    default_font.configure(size=16)
    master.option_add("*Font", default_font)

    app = App(master, database.get_connect())
    app.grid(row=0, column=0)
    master.mainloop()
Esempio n. 2
0
def _list_html_pids(directory):
    """Return the integer ids of all ``<digits>.html`` files in *directory*."""
    return [
        int(name[:-5]) for name in os.listdir(directory)
        if name.endswith('.html') and name[:-5].isdigit()
    ]


def crawl_lifecycle():
    """Crawl missing patent text pages, then queue a lifecycle task.

    Steps, in order:

    1. Every ``pid`` in the ``patent`` table that has no ``<pid>.html`` in
       ``patent_config.crawed_uspto_text_dir`` is put on ``task_queue``;
       20 worker threads are started via ``add_threads`` and joined.
    2. Any tasks left on ``task_queue`` are discarded.
    3. For the pids that had a text page when the function started, the
       page is searched for the first ``digits/digits,digits`` pattern;
       with ``/`` and ``,`` removed it becomes ``app_id``. If the pid has
       no file in ``patent_config.crawed_uspto_lifecycle_dir``,
       ``[pid, app_id]`` is put on ``task_queue``.

    Written so that it parses on both Python 2.6+ and Python 3 (no print
    statement, no ``except X, e``, no ``dict.has_key``).
    """
    text_pids = _list_html_pids(patent_config.crawed_uspto_text_dir)
    have_text = set(text_pids)

    conn = database.get_connect()
    try:
        cur = conn.cursor()
        cur.execute('select pid from patent')
        for row in cur.fetchall():
            pid = int(row[0])
            if pid not in have_text:
                task_queue.put(pid, block=False)
    finally:
        conn.close()

    add_threads(20)
    for thread in threads:
        # Each entry of `threads` is a sequence; item 0 is the joinable object.
        thread[0].join()

    # NOTE(review): this is the directory listing taken *before* the worker
    # threads ran, so pages downloaded above are not included. Kept as in
    # the original; confirm whether a fresh listing was intended.
    all_pids = text_pids
    have_lifecycle = set(
        _list_html_pids(patent_config.crawed_uspto_lifecycle_dir))

    print('%d %d' % (len(all_pids), len(have_lifecycle)))

    # Drain whatever is still queued from the first crawl phase.
    while True:
        try:
            task_queue.get(block=False)
        except Queue.Empty:
            break

    app_no_pattern = re.compile(r'\d+/\d+,\d+')
    separators = re.compile('[/,]')
    for pid in all_pids:
        path = patent_config.crawed_uspto_text_dir + '/' + str(pid) + '.html'
        with open(path) as page:
            match = app_no_pattern.search(page.read())
        try:
            # `match` is None when the page contains no such pattern; that
            # surfaces here as AttributeError.
            app_id = int(separators.sub('', match.group()))
        except (AttributeError, ValueError) as e:
            print(e)
            print('fail - on pid %d' % pid)
            continue
        if pid not in have_lifecycle:
            task_queue.put([pid, app_id], block=False)
        # NOTE(review): this stops after the first page that parses
        # successfully. It looks like a debugging leftover but is kept to
        # preserve the original behaviour; confirm.
        break
def watch_files_caller():
    """Run ``check_watched_files`` periodically until ``STOP_EVENT`` is set.

    One check is performed immediately. After that the loop wakes once a
    second and, whenever at least five seconds have passed since the
    previous check finished, runs another check and commits the
    connection. The connection is always closed on exit, including when a
    check raises.
    """
    connection = database.get_connect()
    try:
        check_watched_files(connection)
        finished = time.time()
        while not STOP_EVENT.is_set():
            if time.time() >= finished + 5:
                check_watched_files(connection)
                connection.commit()
                # Measure the next interval from the end of this check.
                finished = time.time()
            else:
                # Short sleep so a stop request is noticed within ~1 second.
                time.sleep(1)
    finally:
        connection.close()
def similarity():
    """Compute and save pairwise-distance matrices for the ``text`` table.

    Reads every row of ``text`` and takes columns 0-3 as, respectively, the
    pid and three document fields (named abstracts, claims and descriptions
    here -- presumably matching the table's column order; verify against
    the schema). The pids are saved to ``patent_config.similarity + '_pids'``.
    For each document field two distance matrices are saved with
    ``numpy.save``:

    * ``..._<field>_svd_tfidf`` -- ``pairwise_distances`` over
      ``svd(vectorize(docs, 'tfidf'), 100)``
    * ``..._<field>_lda`` -- ``pairwise_distances`` over ``lda(docs, 100)``

    Intermediate results are deleted as soon as they are no longer needed
    to keep peak memory down.
    """
    pids = []
    abstracts = []
    claims = []
    descriptions = []

    conn = database.get_connect()
    try:
        cur = conn.cursor()
        cur.execute('select * from text')
        for row in cur:
            pids.append(row[0])
            abstracts.append(row[1])
            claims.append(row[2])
            descriptions.append(row[3])
    finally:
        # The original never closed this connection.
        conn.close()
    print(len(abstracts))

    numpy.save(patent_config.similarity + '_pids', numpy.array(pids))
    del pids

    corpora = (
        ('abstracts', abstracts),
        ('claims', claims),
        ('descriptions', descriptions),
    )
    # Variants on the raw tf-idf matrix and on a 'count' vectorisation were
    # disabled in the original; only the SVD(tf-idf) and LDA distances are
    # produced.
    for prefix, docs in corpora:
        base = patent_config.similarity + '_' + prefix

        X_tfidf = vectorize(docs, 'tfidf')
        X_svd_tfidf = svd(X_tfidf, 100)
        del X_tfidf
        dist = pairwise_distances(X_svd_tfidf)
        del X_svd_tfidf
        numpy.save(base + '_svd_tfidf', dist, allow_pickle=False)
        del dist

        X_lda = lda(docs, 100)
        dist = pairwise_distances(X_lda)
        del X_lda
        numpy.save(base + '_lda', dist, allow_pickle=False)
        del dist
Esempio n. 5
0
def crawl():
    """Queue every patent without a local text page and run the crawlers.

    A pid counts as already downloaded when a file named ``<pid>.html``
    exists in ``patent_config.crawed_uspto_text_dir``. Every other ``pid``
    from the ``patent`` table is put on ``task_queue``; then 20 worker
    threads are started via ``add_threads`` and joined.
    """
    names = os.listdir(patent_config.crawed_uspto_text_dir)
    downloaded = set(
        int(name[:-5]) for name in names
        if name.endswith('.html') and name[:-5].isdigit()
    )

    conn = database.get_connect()
    try:
        cur = conn.cursor()
        cur.execute('select pid from patent')
        for row in cur.fetchall():
            pid = int(row[0])
            # `in` replaces dict.has_key, which no longer exists in Python 3.
            if pid not in downloaded:
                task_queue.put(pid, block=False)
    finally:
        conn.close()

    add_threads(20)
    for thread in threads:
        # Each entry of `threads` is a sequence; item 0 is the joinable object.
        thread[0].join()
Esempio n. 6
0
def main():
    """main func"""
    connection = database.get_connect()
    cursor = connection.cursor()
    setup(cursor)