def generate_tfidf_pickles():
    """Gets all the read articles, labels those flagged 's' as 1 and the rest as 0,
    and produces the embeddings."""
    sqldb = connect_db(db_path)
    records = query_db(sqldb, '''select feedurl, author, id, title, content, flags
                                 from rss_item where unread=0 order by pubDate DESC;''')
    content_list = []
    outcome_list = []
    id_list = []
    title_list = []
    for record in records:
        # We should not judge the book by its cover
        content_list.append('||' + record['feedurl'] + '|| \n ||' + record['author'] +
                            '|| \n ||' + record['title'] + '|| \n' + record['content'])
        outcome_list.append((record['flags'] is not None and
                             'r' not in record['flags'] and
                             's' in record['flags']) * 1)
        id_list.append(record['id'])
        # Yes, we are judging the book by its cover, but we are using the cool NLP model to judge
        title_list.append(record['title'])

    print("Total %d feed items found" % (len(content_list)))
    print(content_list[0])

    # compute tfidf vectors with scikits
    v = TfidfVectorizer(input='content',
                        encoding='utf-8',
                        decode_error='replace',
                        strip_accents='unicode',
                        lowercase=True,
                        analyzer='word',
                        stop_words='english',
                        token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b',
                        ngram_range=(1, 2),
                        max_features=max_features,
                        norm='l2',
                        use_idf=True,
                        smooth_idf=True,
                        sublinear_tf=True,
                        max_df=1.0,
                        min_df=1)
    v.fit(content_list)

    print("Projecting them to a mathematical space..")
    X_tfidf = v.transform(content_list)
    X_smart = cool_nlp_model.encode(title_list)

    out = {}
    out['X_tfidf'] = X_tfidf
    out['X_smart'] = X_smart
    out['y'] = outcome_list
    out['v'] = v
    # print("writing", tfidf_path)
    safe_pickle_dump(out, tfidf_path)

    out = {}
    out['vocab'] = v.vocabulary_
    out['idf'] = v._tfidf.idf_
    out['ids'] = id_list
    out['idtoi'] = {x: i for i, x in enumerate(id_list)}
    # print("Writing Meta Data")
    safe_pickle_dump(out, meta_path)
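Every snippet below calls a safe_pickle_dump helper that is never shown. A minimal sketch of what such a helper could look like, assuming its job is simply an atomic pickle write; the temp-file-then-rename strategy is an assumption, not code taken from the snippets:

import os
import pickle

def safe_pickle_dump(obj, fname):
    # dump to a temporary file first, then atomically rename over the target,
    # so a crash mid-write never leaves a truncated pickle behind
    tmp = fname + '.tmp'
    with open(tmp, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
    os.replace(tmp, fname)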
def build_model(meta_path, tfidf_path):
    """Given the embeddings, generate our preferences model using support vector machines."""
    meta = pickle.load(open(meta_path, 'rb'))
    out = pickle.load(open(tfidf_path, 'rb'))

    X_tfidf = out['X_tfidf']
    X_tfidf = X_tfidf.todense().astype(np.float32)
    y = out['y']
    y = np.array(y).astype(np.float32)
    X_smart = out['X_smart']

    print('Learning your preferences...')
    clf = LinearSVC(class_weight='balanced', verbose=False, max_iter=1000000, tol=1e-6, C=0.1)
    clf.fit(X_tfidf, y)
    beclf = LinearSVC(class_weight='balanced', verbose=False, max_iter=1000000, tol=1e-6)
    beclf.fit(X_smart, y)

    model = {}
    model['db_name'] = db_path
    model['clf'] = clf
    model['beclf'] = beclf
    safe_pickle_dump(model, model_path)
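Once both pickles exist, ranking an unseen item against the learned preferences is just a transform plus decision_function. A rough usage sketch; the paths and the vectorizer stored under 'v' come from the two functions above, and the '||'-delimited input format mirrors generate_tfidf_pickles:

import pickle
import numpy as np

emb = pickle.load(open(tfidf_path, 'rb'))
model = pickle.load(open(model_path, 'rb'))

new_texts = ['||feedurl|| \n ||author|| \n ||title|| \n article body ...']
X_new = np.asarray(emb['v'].transform(new_texts).todense(), dtype=np.float32)
scores = model['clf'].decision_function(X_new)  # higher = more like items flagged 's'
print(sorted(zip(scores, new_texts), reverse=True)[:5])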
def merge_dicts(dlist):
    m = {}
    for d in dlist:
        for k, v in d.items():
            m[k] = m.get(k, 0) + v
    return m

print('building an index for faster search...')
search_dict = {}
for pid, p in db.items():
    dict_title = makedict(p['title'], forceidf=5, scale=3)
    dict_authors = makedict(' '.join(x['name'] for x in p['authors']), forceidf=5)
    dict_categories = {x['term'].lower(): 5 for x in p['tags']}
    if 'and' in dict_authors:
        # special case for "and" handling in authors list
        del dict_authors['and']
    dict_summary = makedict(p['summary'])
    search_dict[pid] = merge_dicts([dict_title, dict_authors, dict_categories, dict_summary])

CACHE['search_dict'] = search_dict

# save the cache
print('writing', Config.serve_cache_path)
safe_pickle_dump(CACHE, Config.serve_cache_path)
print('writing', Config.db_serve_path)
safe_pickle_dump(db, Config.db_serve_path)
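search_dict maps every paper id to a word-to-weight dictionary, so a free-text query can be ranked by summing the weights of its matching words. A hedged lookup sketch; the scoring rule is an assumption about how this cache is meant to be consumed, not code from the snippet:

def search_rank(query, search_dict, top_k=20):
    qwords = query.lower().strip().split()
    scored = []
    for pid, word_weights in search_dict.items():
        score = sum(word_weights.get(w, 0) for w in qwords)  # sum weights of matched words
        if score > 0:
            scored.append((score, pid))
    scored.sort(reverse=True, key=lambda x: x[0])  # best matches first
    return [pid for _, pid in scored[:top_k]]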
v = TfidfVectorizer(input='content',
                    encoding='utf-8',
                    decode_error='replace',
                    strip_accents='unicode',
                    lowercase=True,
                    analyzer='word',
                    stop_words='english',
                    token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b',
                    ngram_range=(1, 2),
                    max_features=20000,
                    norm='l2',
                    use_idf=True,
                    smooth_idf=True,
                    sublinear_tf=False)
X = v.fit_transform(txts)
print(v.vocabulary_)
print(X.shape)

# write full matrix out
out = {}
out['X'] = X  # this one is heavy!
print('writing tfidf.p')
utils.safe_pickle_dump(out, "tfidf.p")

# writing lighter metadata information into a separate (smaller) file
out = {}
out['vocab'] = v.vocabulary_
out['idf'] = v._tfidf.idf_
out['pids'] = pids  # a full idvv string (id and version number)
out['ptoi'] = {x: i for i, x in enumerate(pids)}  # pid to ix in X mapping
print('writing tfidf_meta.p')
utils.safe_pickle_dump(out, "tfidf_meta.p")

print('precomputing nearest neighbor queries in batches...')
X = X.todense()  # originally it's a sparse matrix
sim_dict = {}
batch_size = 200
for i in range(0, len(pids), batch_size):
def merge_dicts(dlist):
    m = {}
    for d in dlist:
        for k, v in d.items():
            m[k] = m.get(k, 0) + v
    return m

pdf_files_dict = pdf_filewatcher.get_saved_pdf_files()

print('building an index for faster search...')
search_dict = {}
for fp in pdf_files_dict:
    di = pdf_files_dict[fp]
    filename = di['filename']
    tscore = di['tscore']
    paper_date_modified = di['date_modified']
    clean_filename = filename.split('.')[0]
    paper_text = di['txt']
    if paper_text is None:
        paper_text = ''
    dict_title = makedict(clean_filename, forceidf=5, scale=3)
    dict_text = makedict(paper_text)
    final_dict = merge_dicts([dict_title, dict_text])
    search_dict[fp] = final_dict

print('writing search dict', utils.Config.search_dict_path)
utils.safe_pickle_dump(search_dict, utils.Config.search_dict_path)
X = out['X']
X = X.todense()

xtoi = {strip_version(x): i for x, i in meta['ptoi'].items()}

user_sim = {}
for ii, u in enumerate(users):
    print("%d/%d building an SVM for %s" % (ii, len(users), u['username'].encode('utf-8')))
    uid = u['user_id']
    lib = query_db('''select * from library where user_id = ?''', [uid])
    pids = [x['paper_id'] for x in lib]  # raw pids without version
    posix = [xtoi[p] for p in pids if p in xtoi]

    if not posix:
        continue  # empty library for this user maybe?

    print(pids)
    y = np.zeros(X.shape[0])
    for ix in posix:
        y[ix] = 1

    clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.1)
    clf.fit(X, y)
    s = clf.decision_function(X)

    sortix = np.argsort(-s)
    sortix = sortix[:min(num_recommendations, len(sortix))]  # crop paper recommendations to save space
    user_sim[uid] = [strip_version(meta['pids'][ix]) for ix in list(sortix)]

print('writing', Config.user_sim_path)
safe_pickle_dump(user_sim, Config.user_sim_path)
print("%d/%d building an SVM for %s" % (ii, len(users), u['username'].encode('utf-8'))) uid = u['user_id'] lib = query_db('''select * from library where user_id = ?''', [uid]) pids = [x['paper_id'] for x in lib] # raw pids without version posix = [xtoi[p] for p in pids if p in xtoi] if not posix: continue # empty library for this user maybe? print(pids) y = np.zeros(X.shape[0]) for ix in posix: y[ix] = 1 clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.1) clf.fit(X, y) s = clf.decision_function(X) sortix = np.argsort(-s) # crop paper recommendations to save space sortix = sortix[:min(num_recommendations, len(sortix))] user_sim[uid] = [strip_version(meta['pids'][ix]) for ix in list(sortix)] print('writing', Config.user_sim_path) safe_pickle_dump(user_sim, Config.user_sim_path)
recompute_index = False
if recompute_index:
    print('building an index for faster search...')
    for pid in db:
        p = db[pid]
        dict_title = makedict(p['title'], forceidf=5, scale=3)
        dict_authors = makedict(' '.join(x['name'] for x in p['authors']), forceidf=5)
        dict_categories = {x['term'].lower(): 5 for x in p['tags']}
        if 'and' in dict_authors:
            # special case for "and" handling in authors list
            del dict_authors['and']
        dict_summary = makedict(p['summary'])
        SEARCH_DICT[pid] = merge_dicts([dict_title, dict_authors, dict_categories, dict_summary])
    # and cache it in file
    print('writing search_dict.p as cache')
    utils.safe_pickle_dump(SEARCH_DICT, 'search_dict.p')
else:
    print('loading cached index for faster search...')
    SEARCH_DICT = pickle.load(open('search_dict.p', 'rb'))

# start
if args.prod:
    # run on Tornado instead, since running raw Flask in prod is not recommended
    print('starting tornado!')
    from tornado import autoreload
    from tornado.wsgi import WSGIContainer
    from tornado.httpserver import HTTPServer
    from tornado.ioloop import IOLoop
    from tornado.log import enable_pretty_logging
    enable_pretty_logging()
"publushed": j["published"], "authors": [x["name"] for x in j["authors"]], "_version": j["_version"], "category": j["arxiv_primary_category"]["term"] # get full category from taxonomy file } print('Updated %s added %s' % (j['updated'], j['title'])) num_added += 1 num_added_total += 1 else: num_skipped += 1 # print some information print('Added %d papers, already had %d.' % (num_added, num_skipped)) if len(parse.entries) == 0: print('Received no results from arxiv. Rate limiting? Exiting. Restart later maybe.') print(response) break if num_added == 0 and args.break_on_no_added == 1: print('No new papers were added. Assuming no new papers exist. Exiting.') break print('Sleeping for %i seconds' % (args.wait_time , )) time.sleep(args.wait_time + random.uniform(0, 3)) # save the database before we quit, if we found anything new if num_added_total > 0: print('Saving database with %d papers to %s' % (len(db), args.db_path)) safe_pickle_dump(db, args.db_path)
    if not rawid in meta_db or j['_version'] > meta_db[rawid]['_version']:
        # save a big dictionary j to the database
        meta_db[rawid] = j
        # print(j['tags'])
        print('Updated %s added %s' % (j['updated'].encode('utf-8'), j['title'].encode('utf-8')))
        num_added += 1
        num_added_total += 1
    else:
        num_skipped += 1

    # print some information
    print('Added %d papers, already had %d.' % (num_added, num_skipped))

    if len(parse.entries) == 0:
        print('Received no results from arxiv. Rate limiting? Exiting. Restart later maybe.')
        print(response)
        break

    if num_added == 0 and args.break_on_no_added == 1:
        print('No new papers were added. Assuming no new papers exist. Exiting.')
        break

    print('Sleeping for %i seconds' % (args.wait_time, ))
    time.sleep(args.wait_time + random.uniform(0, 3))

# save the database before we quit, if we found anything new
if num_added_total > 0:
    print('Saving database with %d papers to %s' % (len(meta_db), Config.metadata_path))
    safe_pickle_dump(meta_db, Config.metadata_path)
from allennlp.commands.elmo import ElmoEmbedder
import pickle
from utils import Config, safe_pickle_dump
import gensim

elmo = ElmoEmbedder(
    options_file='https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json',
    weight_file='https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
)

db = pickle.load(open(Config.db_path, 'rb'))

summary_tokens = []
for pid, j in db.items():
    # idvv = '%sv%d' % (j['_rawid'], j['_version'])
    summary = j['summary'].replace('\n', ' ')
    summary = gensim.utils.simple_preprocess(summary)
    summary_tokens.append(summary)

print(len(summary_tokens))
elmo_embed = elmo.embed_batch(summary_tokens)
safe_pickle_dump(elmo_embed, 'elmo_embed.p')
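embed_batch returns one array per summary with shape (layers, tokens, 1024), which is awkward to feed to downstream code directly. A hedged post-processing sketch; the mean-pooling choice is an assumption, not something the snippet above does:

import numpy as np

# collapse layers and tokens into a single fixed-size vector per summary
summary_vectors = np.stack([e.mean(axis=(0, 1)) for e in elmo_embed])
print(summary_vectors.shape)  # (num_summaries, 1024)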
def save():
    print("Now saving..")
    open("analysis_errors.txt", 'w').writelines([l + '\n' for l in list(analysis_errors)])
    safe_pickle_dump(sim_dict, Config.sim_path)
def save_db(db):
    print('Saving database with %d papers to %s' % (len(db), Config.db_path))
    safe_pickle_dump(db, Config.db_path)
                                       max_train)]  # crop
print("training on %d documents..." % (len(train_txt_paths), ))
train_corpus = make_corpus(train_txt_paths)
v.fit(train_corpus)

# transform
print("transforming %d documents..." % (len(txt_paths), ))
corpus = make_corpus(txt_paths)
X = v.transform(corpus)
print(v.vocabulary_)
print(X.shape)

# write full matrix out
out['X'] = X  # this one is heavy!
print("writing", Config.tfidf_path)
safe_pickle_dump(out, Config.tfidf_path)

# writing lighter metadata information into a separate (smaller) file
out = {}
out['vocab'] = v.vocabulary_
out['idf'] = v._tfidf.idf_
out['pids'] = pids  # a full idvv string (id and version number)
out['ptoi'] = {x: i for i, x in enumerate(pids)}  # pid to ix in X mapping
print("writing", Config.meta_path)
safe_pickle_dump(out, Config.meta_path)

print("Precomputing nearest neighbor queries in batches...")
for i in range(0, len(pids), batch_size):
    print(datetime.datetime.now(tz))
    i1 = min(len(pids), i + batch_size)
        time_posted = (d - epochd).total_seconds()
        seen[r.id]['time_posted'] = time_posted

    print('processed %d/%d new tweets. Currently maintaining total %d' %
          (num_processed, len(results), len(seen)))

    # maintain state: if something was seen > few days ago, forget it
    maxdt = 60 * 60 * 24 * max_days_keep
    seen_new = {tweetid: d for tweetid, d in seen.items() if tnow - d['time_posted'] < maxdt}
    print('previous seen dict had %d tweets, pruning to %d' % (len(seen), len(seen_new)))
    seen = seen_new  # swap

    # compile all votes and write output for serving
    votes = {}
    for tweetid, d in seen.items():
        for pid in d['pids']:
            votes[pid] = votes.get(pid, 0) + 1
    votes = [(v, k) for k, v in votes.items()]
    votes.sort(reverse=True, key=lambda x: x[0])  # descending
    print('top votes', votes[:min(len(votes), 10)])
    print('writing', Config.tweet_path)
    safe_pickle_dump(votes, Config.tweet_path)

    # and sleep for a while
    print('sleeping', sleep_time)
    time.sleep(sleep_time)
for ii, u in enumerate(users):
    print('%d/%d building an SVM for %s' % (ii, len(users), u['username'].encode('utf-8')))
    uid = u['user_id']
    lib = query_db('''select * from library where user_id = ?''', [uid])
    pids = [x['paper_id'] for x in lib]  # raw pids without version
    posix = [xtoi[p] for p in pids if p in xtoi]

    if not posix:
        continue  # empty library for this user maybe?

    print(pids)
    y = np.zeros(X.shape[0])
    for ix in posix:
        y[ix] = 1

    # __init__(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr',
    #          fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0,
    #          random_state=None, max_iter=1000)
    clf = svm.LinearSVC(class_weight='auto', verbose=True, max_iter=10000, tol=1e-6, C=1)
    clf.fit(X, y)
    s = clf.decision_function(X)

    sortix = np.argsort(-s)
    user_sim[uid] = [strip_version(meta['pids'][ix]) for ix in list(sortix)]

print('writing user_sim.p')
utils.safe_pickle_dump(user_sim, "user_sim.p")
                    token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b',
                    ngram_range=(1, 2),
                    max_features=10000,
                    norm='l2',
                    use_idf=True,
                    smooth_idf=True,
                    sublinear_tf=True,
                    max_df=1.0,
                    min_df=1)

print("training on %d documents..." % (len(txt_list)))
v.fit(txt_list)

print("transforming %d documents..." % (len(txt_list), ))
X = v.transform(txt_list)
print("shape of matrix:", X.shape)

# write full matrix out
out = {}
out['X'] = X  # this one is heavy!
print("writing", utils.Config.tfidf_path)
utils.safe_pickle_dump(out, utils.Config.tfidf_path)

# writing lighter metadata information into a separate (smaller) file
out = {}
out['vocab'] = v.vocabulary_
out['idf'] = v._tfidf.idf_
out['fp'] = filepaths
print("writing", utils.Config.meta_path)
utils.safe_pickle_dump(out, utils.Config.meta_path)
def fetch(args):
    base_url = 'http://export.arxiv.org/api/query?'
    db = pickle_load(args.db_path)
    print('database has {} entries at start'.format(len(db)))

    if args.date_sort_by == 's':
        sort_by = 'submittedDate'
    elif args.date_sort_by == 'u':
        sort_by = 'lastUpdatedDate'
    else:
        print('[Warning] --date-sort-by changed to "lastUpdatedDate"')
        sort_by = 'lastUpdatedDate'

    assert args.max_index - args.start_index > 0, 'error index range from {f} to {t}'.format(
        f=args.start_index, t=args.max_index)

    num_iter = min(args.max_index - args.start_index, args.results_per_iteration)
    num_added_total = 0
    for i in range(args.start_index, args.max_index, args.results_per_iteration):
        if args.id_list == 'none':
            print('Result {} - {}'.format(i, i + num_iter))
            query = 'search_query={q}&sortBy={ds}&start={s}&max_results={m}'.format(
                q=args.search_query, ds=sort_by, s=i, m=num_iter)
        else:
            query = 'id_list={}'.format(args.id_list)

        with urllib.request.urlopen(base_url + query) as url:
            resp = url.read()
        parse = feedparser.parse(resp)

        num_added = 0
        for e in parse.entries:
            j = encode_feedparser_dict(e)
            rawid, version = parse_arxiv_url(j['id'])
            j['_rawid'] = rawid
            j['_version'] = version
            db, cnt = compare_db(db, j)
            if cnt:
                num_added += 1
                num_added_total += 1

        if len(parse.entries) == 0:
            print('Received no results from arXiv.')
            print(resp)
            if args.break_on_no_added != 0:
                break

        if num_added == 0:
            print('No more new papers.')
        if args.id_list != 'none':
            break

        print('Sleeping for {} seconds'.format(args.wait_time))
        time.sleep(args.wait_time + random.uniform(0, 3))

    if num_added_total > 0:
        print('Saving database with {n} papers to {p}'.format(n=len(db), p=args.db_path))
        safe_pickle_dump(db, args.db_path)
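compare_db is not defined in this snippet; from its call site it must return the (possibly updated) db plus a flag saying whether the paper was inserted or upgraded to a newer version, which is exactly the logic the other fetch variants inline. A sketch under that assumption:

def compare_db(db, j):
    rawid = j['_rawid']
    # add if we didn't have it before, or if this is a newer version
    if rawid not in db or j['_version'] > db[rawid]['_version']:
        db[rawid] = j
        return db, True
    return db, False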
    max_features=20000,
    norm="l2",
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
)
X = v.fit_transform(txts)
print v.vocabulary_
print X.shape

# write full matrix out
out = {}
out["X"] = X  # this one is heavy!
print("writing tfidf.p")
utils.safe_pickle_dump(out, "tfidf.p")

# writing lighter metadata information into a separate (smaller) file
out = {}
out["vocab"] = v.vocabulary_
out["idf"] = v._tfidf.idf_
out["pids"] = pids  # a full idvv string (id and version number)
out["ptoi"] = {x: i for i, x in enumerate(pids)}  # pid to ix in X mapping
print("writing tfidf_meta.p")
utils.safe_pickle_dump(out, "tfidf_meta.p")

print "precomputing nearest neighbor queries in batches..."
X = X.todense()  # originally it's a sparse matrix
sim_dict = {}
batch_size = 200
for i in xrange(0, len(pids), batch_size):
def analyze(csv_file, txt_dir):
    db = read_csv(csv_file)

    # read all text files for all papers into memory
    txt_paths, pids = [], []
    n = 0
    for idvv in db:
        n += 1
        # idvv = '%sv%d' % (j['_rawid'], j['_version'])
        # merged text_path
        txt_path = os.path.join(txt_dir, idvv + '.txt')
        if os.path.isfile(txt_path):  # some pdfs dont translate to txt
            with open(txt_path, 'r') as f:
                txt = f.read()
            if len(txt) > 1000 and len(txt) < 500000:  # 500K is VERY conservative upper bound
                txt_paths.append(txt_path)  # todo later: maybe filter or something some of them
                pids.append(idvv)
                print("read %d/%d (%s) with %d chars" % (n, len(db), idvv, len(txt)))
            else:
                print("skipped %d/%d (%s) with %d chars: suspicious!" % (n, len(db), idvv, len(txt)))
        else:
            print("could not find %s in txt folder." % (txt_path, ))
    print("in total read in %d text files out of %d db entries." % (len(txt_paths), len(db)))

    # compute tfidf vectors with scikits
    v = TfidfVectorizer(input='content',
                        encoding='utf-8',
                        decode_error='replace',
                        strip_accents='unicode',
                        lowercase=True,
                        analyzer='word',
                        stop_words='english',
                        token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b',
                        ngram_range=(1, 2),
                        max_features=max_features,
                        norm='l2',
                        use_idf=True,
                        smooth_idf=True,
                        sublinear_tf=True,
                        max_df=1.0,
                        min_df=1)

    # create an iterator object to conserve memory
    def make_corpus(paths):
        for p in paths:
            with open(p, 'r') as f:
                txt = f.read()
            yield txt

    # train
    train_txt_paths = list(txt_paths)  # duplicate
    shuffle(train_txt_paths)  # shuffle
    train_txt_paths = train_txt_paths[:min(len(train_txt_paths), max_train)]  # crop
    print("training on %d documents..." % (len(train_txt_paths), ))
    train_corpus = make_corpus(train_txt_paths)
    v.fit(train_corpus)

    # transform
    print("transforming %d documents..." % (len(txt_paths), ))
    corpus = make_corpus(txt_paths)
    X = v.transform(corpus)
    print(v.vocabulary_)
    print(X.shape)

    # write full matrix out
    out = {}
    out['X'] = X  # this one is heavy!
    print("writing", Config.tfidf_path)
    safe_pickle_dump(out, Config.tfidf_path)

    # writing lighter metadata information into a separate (smaller) file
    out = {}
    out['vocab'] = v.vocabulary_
    out['idf'] = v._tfidf.idf_
    out['pids'] = pids  # a full idvv string (id and version number)
    out['ptoi'] = {x: i for i, x in enumerate(pids)}  # pid to ix in X mapping
    print("writing", Config.meta_path)
    safe_pickle_dump(out, Config.meta_path)

    print("precomputing nearest neighbor queries in batches...")
    X = X.todense()  # originally it's a sparse matrix
    sim_dict = {}
    batch_size = 200
    for i in range(0, len(pids), batch_size):
        i1 = min(len(pids), i + batch_size)
        xquery = X[i:i1]  # BxD
        ds = -np.asarray(np.dot(X, xquery.T))  # NxD * DxB => NxB
        IX = np.argsort(ds, axis=0)  # NxB
        for j in range(i1 - i):
            sim_dict[pids[i + j]] = [pids[q] for q in list(IX[:50, j])]
        print('%d/%d...' % (i, len(pids)))

    print("writing", Config.sim_path)
    # safe_pickle_dump(sim_dict, Config.sim_path)
    write_json(os.path.join(txt_dir, 'sim_dict.json'), sim_dict)
# oom killer was here
v.fit(train_corpus)

# transform
print("transforming %d documents..." % (len(txt_paths),))
corpus = make_corpus(txt_paths)
print("created corpus")
X = v.transform(corpus)
# print(v.vocabulary_)
print(X.shape)

# write full matrix out
out = {}
out["X"] = X  # this one is heavy!
print("writing", Config.tfidf_path)
safe_pickle_dump(out, Config.tfidf_path)

# writing lighter metadata information into a separate (smaller) file
out = {}
out["vocab"] = v.vocabulary_
out["idf"] = v._tfidf.idf_
out["pids"] = pids  # a full idvv string (id and version number)
out["ptoi"] = {x: i for i, x in enumerate(pids)}  # pid to ix in X mapping
print("writing", Config.meta_path)
safe_pickle_dump(out, Config.meta_path)

print("precomputing nearest neighbor queries in batches...")
X = X.todense()  # originally it's a sparse matrix
sim_dict = {}
batch_size = 100
for i in range(0, len(pids), batch_size):
def parse_xml(response):
    # lets load the existing database to memory
    try:
        db = pickle.load(open(Config.db_path, 'rb'))
    except Exception as e:
        print('error loading existing database:')
        print(e)
        print('starting from an empty database')
        db = {}
    print('database has %d entries at start' % (len(db), ))

    OAI = "{http://www.openarchives.org/OAI/2.0/}"
    ARXIV = "{http://arxiv.org/OAI/arXivRaw/}"

    parse = objectify.parse(response)
    num_added = 0
    num_skipped = 0
    root = parse.getroot()
    record_list = root.find(OAI + 'ListRecords').findall(OAI + "record")

    for record in record_list:
        info = record.metadata.find(ARXIV + "arXivRaw")

        versions = info.findall(ARXIV + "version")
        version_num = len(versions)
        published_version = info.find(ARXIV + "version[@version='v1']")
        latest_version = info.find(ARXIV + "version[@version='v" + str(version_num) + "']")
        published_parsed = dateutil.parser.parse(published_version.date.text)
        updated_parsed = dateutil.parser.parse(latest_version.date.text)
        published = published_parsed.strftime('%Y-%m-%d')
        updated = updated_parsed.strftime('%Y-%m-%d')

        authors = []
        author_list = info.authors.text.replace(', and ', ', ')
        author_list = author_list.replace(' and ', ', ')
        author_list = author_list.split(', ')
        for author in author_list:
            authors.append({'name': author})

        cats = info.categories.text.split()
        primary_cat = {'term': cats[0]}
        tags = []
        for cat in cats:
            tags.append({'term': cat})

        rawid = info.id.text
        id_url = 'http://arxiv.org/abs/' + info.id.text

        if hasattr(info, 'doi'):
            doi = info.doi.text
        else:
            doi = ''
        if hasattr(info, 'journal-ref'):
            journal = info.find(ARXIV + 'journal-ref').text
        else:
            journal = ''
        if hasattr(info, 'comments'):
            comment = info.find(ARXIV + 'comments').text
        else:
            comment = ''

        links = [{
            'href': 'http://arxiv.org/abs/' + rawid + 'v' + str(version_num),
            'rel': 'alternate',
            'type': 'text/html'
        }, {
            'href': 'http://arxiv.org/pdf/' + rawid + 'v' + str(version_num),
            'rel': 'related',
            'title': 'pdf',
            'type': 'application/pdf'
        }]

        j = {
            'published': published,
            'updated': updated,
            'updated_parsed': updated_parsed,
            'published_parsed': published_parsed,
            'authors': authors,
            'tags': tags,
            'arxiv_primary_category': primary_cat,
            'arxiv_doi': doi,
            'arxiv_journal_ref': journal,
            'id': id_url,
            'link': id_url,
            'links': links,
            '_rawid': rawid,
            '_version': version_num,
            'title': info.title.text,
            'summary': info.abstract.text,
            'arxiv_comment': comment,
        }

        # add to our database if we didn't have it before, or if this is a new version
        if not rawid in db or j['_version'] > db[rawid]['_version']:
            db[rawid] = j
            print('Updated %s added %s' % (j['updated'].encode('utf-8'), j['title'].encode('utf-8')))
            num_added += 1
        else:
            num_skipped += 1

    # print some information
    print('Added %d papers, already had %d.' % (num_added, num_skipped))

    # save the database before we quit, if we found anything new
    print('Saving database with %d papers to %s' % (len(db), Config.db_path))
    safe_pickle_dump(db, Config.db_path)
    return
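parse_xml receives the raw OAI-PMH response body, but the request itself is not shown. A hedged sketch of how that response might be fetched, assuming the standard arXiv OAI endpoint and the arXivRaw metadata format the parser above expects; the set name 'cs' is a placeholder:

import urllib.request

url = ('http://export.arxiv.org/oai2?verb=ListRecords'
       '&metadataPrefix=arXivRaw&set=cs')
with urllib.request.urlopen(url) as f:
    response = f.read()
parse_xml(response)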
        num_skipped += 1

    # -------------------- end of the main fetch loop; print statistics (Step 5) --------------------
    # print some information
    print('Added %d papers, already had %d.' % (num_added, num_skipped))

    # when rate-limited, could we wait for a while and then continue instead of exiting?
    '''
    if len(parse.entries) == 0:  # the original program exited immediately here
        print('Received no results from arxiv. Rate limiting? Exiting. Restart later maybe.')
        print(response)
        break
    '''

    if num_added == 0 and args.break_on_no_added == 1:
        print('No new papers were added. Assuming no new papers exist. Exiting.')
        break

    print('Sleeping for %i seconds' % (args.wait_time, ))
    time.sleep(args.wait_time + random.uniform(0, 3))

# save the database before we quit, if we found anything new
if num_added_total > 0:
    print('Saving database with %d papers to %s' % (len(db), Config.db_path))
    safe_pickle_dump(db, Config.db_path)  # export to external storage
        txt_path = os.path.join('data', 'txt', idvv) + '.pdf.txt'
        if os.path.isfile(txt_path):  # some pdfs dont translate to txt
            with open(txt_path, 'r') as f:
                txt = f.read()
            if len(txt) > 1000 and len(txt) < 500000:  # 500K is VERY conservative upper bound
                txt_paths.append(txt_path)  # todo later: maybe filter or something some of them
                pids.append(idvv)
                print("read %d/%d (%s) with %d chars" % (n, len(db), idvv, len(txt)))
                trainedpapers[pid] = j
            else:
                print("skipped %d/%d (%s) with %d chars: suspicious!" % (n, len(db), idvv, len(txt)))
        else:
            print("could not find %s in txt folder." % (txt_path, ))
    print("in total read in %d text files out of %d db entries." % (len(txt_paths), len(db)))

    print("writing ", Config.trained_path)
    safe_pickle_dump(trainedpapers, Config.trained_path)

    # compute tfidf vectors with scikits
    vectorizer = TfidfVectorizer(input='content',
                                 encoding='utf-8',
                                 decode_error='replace',
                                 strip_accents='unicode',
                                 lowercase=True,
                                 analyzer='word',
                                 stop_words='english',
                                 token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b',
                                 norm='l2',
                                 use_idf=True,
                                 smooth_idf=True,
                                 sublinear_tf=True,
                                 max_df=1.0,
                                 min_df=1)

    # build an SVD model, n_components = 100 is chosen at random
    svd_model = TruncatedSVD(n_components=100, algorithm='randomized', n_iter=10, random_state=42)

    # create an iterator object to conserve memory
def main():
    # parser = argparse.ArgumentParser()
    # ## Required parameters
    # parser.add_argument("--input_file", default=None, type=str, required=True)
    # parser.add_argument("--output_file", default=None, type=str, required=True)
    # parser.add_argument("--bert_model", default=None, type=str, required=True,
    #                     help="Bert pre-trained model selected in the list: bert-base-uncased, "
    #                          "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    # ## Other parameters
    # parser.add_argument("--do_lower_case", action='store_true',
    #                     help="Set this flag if you are using an uncased model.")
    # parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
    # parser.add_argument("--max_seq_length", default=128, type=int,
    #                     help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
    #                          "than this will be truncated, and sequences shorter than this will be padded.")
    # parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
    # parser.add_argument("--local_rank", type=int, default=-1,
    #                     help="local_rank for distributed training on gpus")
    # parser.add_argument("--no_cuda", action='store_true',
    #                     help="Whether not to use CUDA when available")
    # args = parser.parse_args()

    local_rank = -1  # TODO: change this
    no_cuda = True
    # layers = "-1,-2,-3,-4"
    bert_model = 'bert-base-uncased'
    do_lower_case = True
    max_seq_length = 150
    batch_size = 32
    feature_size = 768

    if local_rank == -1 or no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {} distributed training: {}".format(device, n_gpu, bool(local_rank != -1)))

    # layer_indexes = [int(x) for x in layers.split(",")]

    tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)
    model = BertModel.from_pretrained(bert_model)
    model.to(device)

    if local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank],
                                                          output_device=local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    examples = []
    # 2100+ papers
    for pid, j in db.items():
        # idvv = '%sv%d' % (j['_rawid'], j['_version'])
        summary = j['summary'].replace('\n', ' ')
        examples.append(InputExample(pid, summary))

    features = convert_examples_to_features(examples=examples,
                                            seq_length=max_seq_length,
                                            tokenizer=tokenizer)

    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    if local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=batch_size)

    model.eval()
    layer_index = -2
    bert_out = collections.OrderedDict()
    for input_ids, input_mask, example_indices in eval_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)

        for b, example_index in enumerate(example_indices):
            print("Example_index: ", example_index)
            feature = features[example_index.item()]
            unique_id = feature.unique_id
            feature = unique_id_to_feature[unique_id]
            output_json = collections.OrderedDict()
            output_json["linex_index"] = unique_id
            layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
            layer_output = layer_output[b]  # 128*768
            # average pooling
            # sentence_embedding = np.mean(layer_output, 0)
            # sentence_embedding = [round(x.item(), 3) for x in sentence_embedding]
            print(unique_id, "bert out")
            bert_out[unique_id] = np.round(layer_output.flatten(), 3)

    safe_pickle_dump(bert_out, 'bert_out_big.p')
        # extract just the raw arxiv id and version for this paper
        rawid, version = parse_arxiv_url(j['id'])
        j['_rawid'] = rawid
        j['_version'] = version

        # add to our database if we didn't have it before, or if this is a new version
        if not rawid in db or j['_version'] > db[rawid]['_version']:
            db[rawid] = j
            print('updated %s added %s' % (j['updated'].encode('utf-8'), j['title'].encode('utf-8')))
            num_added += 1
        else:
            num_skipped += 1

    # print some information
    print('Added %d papers, already had %d.' % (num_added, num_skipped))

    if len(parse.entries) == 0:
        print('Received no results from arxiv. Rate limiting? Exiting. Restart later maybe.')
        print(response)
        break

    if num_added == 0 and args.break_on_no_added == 1:
        print('No new papers were added. Assuming no new papers exist. Exiting.')
        break

    print('Sleeping for %i seconds' % (args.wait_time, ))
    time.sleep(args.wait_time + random.uniform(0, 3))

# save the database before we quit
print('saving database with %d papers to %s' % (len(db), args.db_path))
utils.safe_pickle_dump(db, args.db_path)
    for entry in map(encode_feedparser_dict, feed.entries):
        # extract just the raw arxiv id and version for this paper
        rawid, version = parse_arxiv_url(entry["id"])
        entry["_rawid"], entry["_version"] = rawid, version

        # add to our database if we didn't have it before, or if this is a new version
        if rawid not in paper_db or version > paper_db[rawid]["_version"]:
            paper_db[rawid] = entry
            print(f"Updated {entry['updated']} added {entry['title']}")
            num_added += 1
        else:
            num_skipped += 1

    # print some information
    print("Added %d papers, already had %d." % (num_added, num_skipped))
    num_added_total += num_added

    if num_added == 0 and args.break_on_no_added == 1:
        print("No new papers were added. Assuming "
              "no new papers exist. Exiting.")
        break

    print(f"Sleeping for {args.wait_time} seconds")
    time.sleep(args.wait_time + random.uniform(0, 3))

# save the database before we quit, if we found anything new
if num_added_total > 0:
    print(f"Saving database with {len(paper_db)} papers to {Config.db_path}")
    safe_pickle_dump(paper_db, Config.db_path)
X = X.todense()

xtoi = {strip_version(x): i for x, i in meta['ptoi'].iteritems()}

user_sim = {}
for ii, u in enumerate(users):
    print '%d/%d building an SVM for %s' % (ii, len(users), u['username'].encode('utf-8'))
    uid = u['user_id']
    lib = query_db('''select * from library where user_id = ?''', [uid])
    pids = [x['paper_id'] for x in lib]  # raw pids without version
    posix = [xtoi[p] for p in pids if p in xtoi]

    if not posix:
        continue  # empty library for this user maybe?

    print pids
    y = np.zeros(X.shape[0])
    for ix in posix:
        y[ix] = 1

    # __init__(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr',
    #          fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0,
    #          random_state=None, max_iter=1000)
    clf = svm.LinearSVC(class_weight='auto', verbose=True, max_iter=10000, tol=1e-6, C=1)
    clf.fit(X, y)
    s = clf.decision_function(X)

    sortix = np.argsort(-s)
    user_sim[uid] = [strip_version(meta['pids'][ix]) for ix in list(sortix)]

print 'writing user_sim.p'
utils.safe_pickle_dump(user_sim, "user_sim.p")
        j['_rawid'] = rawid
        j['_version'] = version

        # add to our database if we didn't have it before, or if this is a new version
        if not rawid in db or j['_version'] > db[rawid]['_version']:
            db[rawid] = j
            print 'updated %s added %s' % (j['updated'].encode('utf-8'), j['title'].encode('utf-8'))
            num_added += 1
        else:
            num_skipped += 1

    # print some information
    print 'Added %d papers, already had %d.' % (num_added, num_skipped)

    if len(parse.entries) == 0:
        print 'Received no results from arxiv. Rate limiting? Exiting. Restart later maybe.'
        print response
        break

    if num_added == 0 and args.break_on_no_added == 1:
        print 'No new papers were added. Assuming no new papers exist. Exiting.'
        break

    print 'Sleeping for %i seconds' % (args.wait_time, )
    time.sleep(args.wait_time + random.uniform(0, 3))

# save the database before we quit
print 'saving database with %d papers to %s' % (len(db), args.db_path)
utils.safe_pickle_dump(db, args.db_path)
recompute_index = False
if recompute_index:
    print 'building an index for faster search...'
    for pid in db:
        p = db[pid]
        dict_title = makedict(p['title'], forceidf=5, scale=3)
        dict_authors = makedict(' '.join(x['name'] for x in p['authors']), forceidf=5)
        dict_categories = {x['term'].lower(): 5 for x in p['tags']}
        if 'and' in dict_authors:
            # special case for "and" handling in authors list
            del dict_authors['and']
        dict_summary = makedict(p['summary'])
        SEARCH_DICT[pid] = merge_dicts([dict_title, dict_authors, dict_categories, dict_summary])
    # and cache it in file
    print 'writing search_dict.p as cache'
    utils.safe_pickle_dump(SEARCH_DICT, 'search_dict.p')
else:
    print 'loading cached index for faster search...'
    SEARCH_DICT = pickle.load(open('search_dict.p', 'rb'))

# start
if args.prod:
    # run on Tornado instead, since running raw Flask in prod is not recommended
    print 'starting tornado!'
    from tornado.wsgi import WSGIContainer
    from tornado.httpserver import HTTPServer
    from tornado.ioloop import IOLoop
    from tornado.log import enable_pretty_logging
    enable_pretty_logging()
    http_server = HTTPServer(WSGIContainer(app))
    http_server.listen(args.port)
print(f"training on {n_train} documents...") # duplicate, shuffle, split and train, then transform train_txt_paths = list(txt_paths) shuffle(train_txt_paths) v.fit(train_txt_paths[:n_train]) print(f"transforming {len(txt_paths)} documents...") X = v.transform(txt_paths) print(v.vocabulary_) print(X.shape) # write full matrix out, this one is heavy! print("writing", Config.tfidf_path) safe_pickle_dump({"X": X}, Config.tfidf_path) # writing lighter metadata information into a separate (smaller) file print("writing", Config.meta_path) safe_pickle_dump( { 'vocab': v.vocabulary_, 'idf': v._tfidf.idf_, 'pids': pids, # a full idvv string (id and version number) 'ptoi': {x: i for i, x in enumerate(pids)} # pid to ix in X mapping }, Config.meta_path) print("precomputing nearest neighbor queries in batches...") X = X.todense() # originally it's a sparse matrix
def dump_rawid(rawid, dumpdic):
    db = pickle.load(open(Config.db_path, 'rb'))
    # add if we didn't have it before, or if this is a newer version
    if rawid not in db or dumpdic['_version'] > db[rawid]['_version']:
        db[rawid] = dumpdic
    safe_pickle_dump(db, Config.db_path)
print("training on %d documents..." % (len(train_txt_paths), )) train_corpus = make_corpus(train_txt_paths) v.fit(train_corpus) # transform print("transforming %d documents..." % (len(txt_paths), )) corpus = make_corpus(txt_paths) X = v.transform(corpus) pprint(v.vocabulary_) print(X.shape) # write full matrix out out = {} out['X'] = X # this one is heavy! print("writing", Config.tfidf_path) safe_pickle_dump(out, Config.tfidf_path) # writing lighter metadata information into a separate (smaller) file out = {} out['vocab'] = v.vocabulary_ out['idf'] = v._tfidf.idf_ out['pids'] = pids # a full idvv string (id and version number) out['ptoi'] = {x: i for i, x in enumerate(pids)} # pid to ix in X mapping print("writing", Config.meta_path) safe_pickle_dump(out, Config.meta_path) print("precomputing nearest neighbor queries in batches...") X = X.todense() # originally it's a sparse matrix sim_dict = {} batch_size = 200 for i in range(0, len(pids), batch_size):
    print('building an index for faster search...')
    for pid in db:
        p = db[pid]
        dict_title = makedict(p['title'], forceidf=5, scale=3)
        dict_authors = makedict(' '.join(x['name'] for x in p['authors']), forceidf=5)
        dict_categories = {x['term'].lower(): 5 for x in p['tags']}
        if 'and' in dict_authors:
            # special case for "and" handling in authors list
            del dict_authors['and']
        dict_summary = makedict(p['summary'])
        SEARCH_DICT[pid] = merge_dicts([dict_title, dict_authors, dict_categories, dict_summary])
    # and cache it in file
    print('writing ', Config.search_dict_path, ' as cache...')
    safe_pickle_dump(SEARCH_DICT, Config.search_dict_path)
else:
    print('loading cached index for faster search from', Config.search_dict_path)
    SEARCH_DICT = pickle.load(open(Config.search_dict_path, 'rb'))

# start
if args.prod:
    # run on Tornado instead, since running raw Flask in prod is not recommended
    print('starting tornado!')
    from tornado.wsgi import WSGIContainer
    from tornado.httpserver import HTTPServer
    from tornado.ioloop import IOLoop
    from tornado.log import enable_pretty_logging
    enable_pretty_logging()
    http_server = HTTPServer(WSGIContainer(app))
        # add to our database if we didn't have it before, or if this is a new version
        if not rawid in db or j['_version'] > db[rawid]['_version']:
            db[rawid] = j
            print('Updated %s added %s' % (j['updated'].encode('utf-8'), j['title'].encode('utf-8')))
            num_added += 1
            num_added_total += 1
        else:
            num_skipped += 1

    # print some information
    print('Added %d papers, already had %d.' % (num_added, num_skipped))

    if len(parse.entries) == 0:
        print('Received no results from arxiv. Rate limiting? Exiting. Restart later maybe.')
        print(response)
        break

    if num_added == 0 and args.break_on_no_added == 1:
        print('No new papers were added. Assuming no new papers exist. Exiting.')
        break

    print('Sleeping for %i seconds' % (args.wait_time, ))
    time.sleep(args.wait_time + random.uniform(0, 3))

# save the database before we quit, if we found anything new
if num_added_total > 0:
    print('Saving database with %d papers to %s' % (len(db), Config.db_path))
    safe_pickle_dump(db, Config.db_path)
                  (n, len(db), idvv, len(txt)))
        else:
            print("could not find %s in txt folder." % (txt_path, ))
    print("in total read in %d text files out of %d db entries." % (len(txt_paths), len(db)))

    print("precomputing nearest neighbor queries in batches...")
    # X = X.todense()  # originally it's a sparse matrix
    sim_dict = {}
    # batch_size = 200
    # for i in range(0, len(pids), batch_size):
    #     i1 = min(len(pids), i + batch_size)
    #     xquery = X[i:i1]  # BxD
    #     ds = -np.asarray(np.dot(X, xquery.T))  # NxD * DxB => NxB
    #     IX = np.argsort(ds, axis=0)  # NxB
    #     for j in range(i1 - i):
    #         sim_dict[pids[i + j]] = [pids[q] for q in list(IX[:50, j])]
    #     print('%d/%d...' % (i, len(pids)))

    model = Doc2Vec.load("d2v.model")
    for pid in pids:
        tmp = []
        try:
            tmp = model.docvecs.most_similar(pid)
        except:
            tmp = []
        sim_dict[pid] = [sim_pid for sim_pid, distance in tmp]

    print("writing", Config.sim_path)
    safe_pickle_dump(sim_dict, Config.sim_path)
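The d2v.model file is loaded above but never built; for most_similar(pid) to work, the training documents must have been tagged with the same pids. A hedged training sketch under that assumption (vector size and epoch count are placeholders, not values from the snippet):

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

# one TaggedDocument per paper, tagged with its pid so most_similar(pid) works later
docs = [TaggedDocument(words=simple_preprocess(open(p).read()), tags=[pid])
        for p, pid in zip(txt_paths, pids)]
model = Doc2Vec(vector_size=100, min_count=2, epochs=20)
model.build_vocab(docs)
model.train(docs, total_examples=model.corpus_count, epochs=model.epochs)
model.save("d2v.model")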