def main():
    """Entry point: verify the database, then start the Tk UI loop."""
    # One short-lived connection for the startup checks...
    startup_conn = database.get_connect()
    check_db(startup_conn)
    check_watched_files(startup_conn)
    startup_conn.close()

    root = tk.Tk()
    # Bump the default font so every widget inherits the larger size.
    base_font = tkfont.nametofont("TkDefaultFont")
    base_font.configure(size=16)
    root.option_add("*Font", base_font)

    # ...and a fresh connection owned by the running App.
    app = App(root, database.get_connect())
    app.grid(row=0, column=0)
    root.mainloop()
def crawl_lifecycle(): files = os.listdir(patent_config.crawed_uspto_text_dir) local_pids = [ int(x[0:-5]) for x in files if len(x) > 5 and x[-5:] == '.html' and x[0:-5].isdigit() ] local_pids = dict(zip(local_pids, [0] * len(local_pids))) conn = database.get_connect() cur = conn.cursor() cur.execute('select pid from patent') for r in cur.fetchall(): pid = int(r[0]) if not local_pids.has_key(pid): task_queue.put(pid, block=False) conn.close() add_threads(20) for thread in threads: thread[0].join() all_pids = [ int(x[0:-5]) for x in files if len(x) > 5 and x[-5:] == '.html' and x[0:-5].isdigit() ] files = os.listdir(patent_config.crawed_uspto_lifecycle_dir) local_pids = [ int(x[0:-5]) for x in files if len(x) > 5 and x[-5:] == '.html' and x[0:-5].isdigit() ] local_pids = dict(zip(local_pids, [0] * len(local_pids))) print len(all_pids), len(local_pids) while True: try: task_queue.get(block=False) except Queue.Empty: break reg = re.compile('\d+\/\d+,\d+') #*Appl\.\s*No\..*\d+').*?(\d+\/\d+,\d+)') spliter = re.compile('[/,]') for pid in all_pids: with open(patent_config.crawed_uspto_text_dir + '/' + str(pid) + '.html') as f: m = reg.search(f.read()) try: app_id = int(''.join(spliter.split(m.group()))) except Exception, e: print e print 'fail - on pid %d' % pid continue if not local_pids.has_key(pid): task_queue.put([pid, app_id], block=False) break
def watch_files_caller():
    """schedule! Re-scan the watched files every 5 seconds until stopped."""
    connection = database.get_connect()
    check_watched_files(connection)
    last_run = time.time()
    while not STOP_EVENT.is_set():
        # Sleep in 1-second slices so a stop request is noticed quickly.
        if time.time() < last_run + 5:
            time.sleep(1)
            continue
        check_watched_files(connection)
        connection.commit()
        last_run = time.time()
    connection.close()
def similarity(): conn = database.get_connect() cur = conn.cursor() cur.execute('select * from text') pids = [] abstracts = [] claims = [] descriptions = [] for r in cur: pids.append(r[0]) abstracts.append(r[1]) claims.append(r[2]) descriptions.append(r[3]) print len(abstracts) prefix = ['abstracts', 'claims', 'descriptions'] numpy.save(patent_config.similarity + '_pids', numpy.array(pids)) del pids for i, docs in enumerate([abstracts, claims, descriptions]): X_tfidf = vectorize(docs, 'tfidf') # dist = pairwise_distances(X_tfidf) # numpy.save(patent_config.similarity + '_' + prefix[i] + '_tfidf',dist,False) # del dist # X_count = vectorize(docs,'count') # dist = pairwise_distances(X_count) # numpy.save(patent_config.similarity + '_' + prefix[i] + '_count',dist,False) X_svd_tfidf = svd(X_tfidf, 100) del X_tfidf dist = pairwise_distances(X_svd_tfidf) del X_svd_tfidf numpy.save(patent_config.similarity + '_' + prefix[i] + '_svd_tfidf', dist, False) del dist # X_svd_count = svd(X_count,100) # dist = pairwise_distances(X_svd_count) # numpy.save(patent_config.similarity + '_' + prefix[i] + '_svd_count',dist,False) X_lda = lda(docs, 100) dist = pairwise_distances(X_lda) del X_lda numpy.save(patent_config.similarity + '_' + prefix[i] + '_lda', dist, False) del dist
def crawl():
    """Queue every patent pid missing a local text page and crawl it.

    Compares the pids in the `patent` table against the "<pid>.html" files
    already on disk, queues the missing ones, then runs 20 crawler threads
    to completion.
    """
    files = os.listdir(patent_config.crawed_uspto_text_dir)
    # pids already downloaded; a set gives O(1) membership tests
    # (the original built a {pid: 0} dict purely for lookup).
    local_pids = set(
        int(x[:-5]) for x in files
        if len(x) > 5 and x.endswith('.html') and x[:-5].isdigit()
    )
    conn = database.get_connect()
    cur = conn.cursor()
    cur.execute('select pid from patent')
    for r in cur.fetchall():
        pid = int(r[0])
        # FIX: dict.has_key() was removed in Python 3; `in` works in both.
        if pid not in local_pids:
            task_queue.put(pid, block=False)
    conn.close()
    add_threads(20)
    for thread in threads:
        thread[0].join()
def main():
    """main func"""
    conn = database.get_connect()
    cur = conn.cursor()
    # NOTE(review): the connection is neither committed nor closed here --
    # presumably setup() handles persistence itself; confirm.
    setup(cur)