def generate_tfidf_pickles():
    """Gets all the read articles and considers those articles flagged as 's' as 1 and rest as 0 
    and produces the embeddings
    """
    sqldb = connect_db(db_path)
    records = query_db(sqldb, '''select feedurl, author, id, title, content, flags from rss_item where unread=0 order by pubDate DESC;''')
    content_list = []
    outcome_list = []
    id_list = []
    title_list = []
    for record in records:
        # We should not judge a book by its cover, so the full content goes into TF-IDF
        content_list.append('||' + record['feedurl'] + '|| \n ||' + record['author'] + '|| \n ||' + record['title'] + '|| \n' + record['content'])
        outcome_list.append(int(record['flags'] is not None and 'r' not in record['flags'] and 's' in record['flags']))
        id_list.append(record['id'])
        # Yes, here we do judge the book by its cover, but we let the cool NLP model do the judging on titles
        title_list.append(record['title'])
    print("Total %d feed items found" %(len(content_list)))
    print(content_list[0])
    # compute tfidf vectors with scikits
    v = TfidfVectorizer(input='content', 
            encoding='utf-8', decode_error='replace', strip_accents='unicode', 
            lowercase=True, analyzer='word', stop_words='english', 
            token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b',
            ngram_range=(1, 2), max_features = max_features, 
            norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True,
            max_df=1.0, min_df=1)
    v.fit(content_list)
    print("Projecting them to a mathematical space..")
    X_tfidf = v.transform(content_list)
    X_smart = cool_nlp_model.encode(title_list)
    out = {}
    out['X_tfidf'] = X_tfidf
    out['X_smart'] = X_smart
    out['y'] = outcome_list
    out['v'] = v
    #print("writing", tfidf_path)
    safe_pickle_dump(out, tfidf_path)
    out = {}
    out['vocab'] = v.vocabulary_
    out['idf'] = v._tfidf.idf_
    out['ids'] = id_list
    out['idtoi'] = {x:i for i,x in enumerate(id_list)}
    #print("Writing Meta Data")
    safe_pickle_dump(out, meta_path)
def build_model(meta_path, tfidf_path):
    """
        Given the embeddings, generate our preferences
        model using support vector machines
    """
    meta = pickle.load(open(meta_path, 'rb'))
    out = pickle.load(open(tfidf_path, 'rb'))
    X_tfidf = out['X_tfidf']
    X_tfidf = X_tfidf.todense().astype(np.float32)
    y = out['y']
    y = np.array(y).astype(np.float32)
    X_smart = out['X_smart']
    print('Learning your preferences...')
    clf = LinearSVC(class_weight='balanced', verbose=False, max_iter=1000000, tol=1e-6, C=0.1)
    clf.fit(X_tfidf, y)
    beclf = LinearSVC(class_weight='balanced', verbose=False, max_iter=1000000, tol=1e-6)
    beclf.fit(X_smart, y)
    model = {}
    model['db_name'] = db_path
    model['clf'] = clf
    model['beclf'] = beclf
    safe_pickle_dump(model, model_path)
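
# A minimal usage sketch (an assumption, not part of the snippet above: db_path, tfidf_path,
# meta_path and model_path are taken to be module-level config values, as the code implies):
# build the embeddings first, then fit the preference model on top of them.
if __name__ == '__main__':
    generate_tfidf_pickles()
    build_model(meta_path, tfidf_path)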
Example #3

def merge_dicts(dlist):
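    """Sum per-key counts from a list of dicts into a single dict."""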
    m = {}
    for d in dlist:
        for k, v in d.items():
            m[k] = m.get(k, 0) + v
    return m


print('building an index for faster search...')
search_dict = {}
for pid, p in db.items():
    dict_title = makedict(p['title'], forceidf=5, scale=3)
    dict_authors = makedict(' '.join(x['name'] for x in p['authors']),
                            forceidf=5)
    dict_categories = {x['term'].lower(): 5 for x in p['tags']}
    if 'and' in dict_authors:
        # special case for "and" handling in authors list
        del dict_authors['and']
    dict_summary = makedict(p['summary'])
    search_dict[pid] = merge_dicts(
        [dict_title, dict_authors, dict_categories, dict_summary])
CACHE['search_dict'] = search_dict

# save the cache
print('writing', Config.serve_cache_path)
safe_pickle_dump(CACHE, Config.serve_cache_path)
print('writing', Config.db_serve_path)
safe_pickle_dump(db, Config.db_serve_path)
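
# A hypothetical helper over search_dict (not part of the original code): makedict/merge_dicts
# produce a term -> weight map per paper, so a query can be scored by summing the weights of
# its terms and keeping the top hits.
def search_papers(query, search_dict, top_k=20):
    qterms = query.lower().split()
    scored = []
    for pid, term_weights in search_dict.items():
        score = sum(term_weights.get(t, 0) for t in qterms)
        if score > 0:
            scored.append((score, pid))
    scored.sort(reverse=True)
    return [pid for _, pid in scored[:top_k]]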
Example #4
v = TfidfVectorizer(input='content', 
        encoding='utf-8', decode_error='replace', strip_accents='unicode', 
        lowercase=True, analyzer='word', stop_words='english', 
        token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b',
        ngram_range=(1, 2), max_features = 20000, 
        norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)

X = v.fit_transform(txts)
print(v.vocabulary_)
print(X.shape)

# write full matrix out
out = {}
out['X'] = X # this one is heavy!
print('writing tfidf.p')
utils.safe_pickle_dump(out, "tfidf.p")

# writing lighter metadata information into a separate (smaller) file
out = {}
out['vocab'] = v.vocabulary_
out['idf'] = v._tfidf.idf_
out['pids'] = pids # a full idvv string (id and version number)
out['ptoi'] = { x:i for i,x in enumerate(pids) } # pid to ix in X mapping
print('writing tfidf_meta.p')
utils.safe_pickle_dump(out, "tfidf_meta.p")

print('precomputing nearest neighbor queries in batches...')
X = X.todense() # originally it's a sparse matrix
sim_dict = {}
batch_size = 200
for i in range(0, len(pids), batch_size):
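  # (the listing is cut off here; the fuller variant of this loop later in this file fills
  #  sim_dict with batched cosine-similarity lookups roughly as follows)
  i1 = min(len(pids), i + batch_size)
  xquery = X[i:i1]  # BxD
  ds = -np.asarray(np.dot(X, xquery.T))  # NxD * DxB => NxB
  IX = np.argsort(ds, axis=0)  # NxB
  for j in range(i1 - i):
    sim_dict[pids[i + j]] = [pids[q] for q in list(IX[:50, j])]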
Example #5
def merge_dicts(dlist):
    m = {}
    for d in dlist:
        for k, v in d.items():
            m[k] = m.get(k, 0) + v
    return m


pdf_files_dict = pdf_filewatcher.get_saved_pdf_files()

print('building an index for faster search...')
search_dict = {}
for fp in pdf_files_dict:
    di = pdf_files_dict[fp]
    filename = di['filename']
    tscore = di['tscore']
    paper_date_modified = di['date_modified']
    clean_filename = filename.split('.')[0]
    paper_text = di['txt']
    if paper_text is None:
        paper_text = ''

    dict_title = makedict(clean_filename, forceidf=5, scale=3)
    dict_text = makedict(paper_text)

    final_dict = merge_dicts([dict_title, dict_text])
    search_dict[fp] = final_dict

print('writing search dict', utils.Config.search_dict_path)
utils.safe_pickle_dump(search_dict, utils.Config.search_dict_path)
X = out['X']
X = X.todense()

xtoi = { strip_version(x):i for x,i in meta['ptoi'].items() }

user_sim = {}
for ii,u in enumerate(users):
  print("%d/%d building an SVM for %s" % (ii, len(users), u['username'].encode('utf-8')))
  uid = u['user_id']
  lib = query_db('''select * from library where user_id = ?''', [uid])
  pids = [x['paper_id'] for x in lib] # raw pids without version
  posix = [xtoi[p] for p in pids if p in xtoi]
  
  if not posix:
    continue # empty library for this user maybe?

  print(pids)
  y = np.zeros(X.shape[0])
  for ix in posix: y[ix] = 1

  clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.1)
  clf.fit(X,y)
  s = clf.decision_function(X)
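  # s[i] is the signed distance of paper i from the SVM boundary; larger means "more like
  # this user's library", so sorting by -s below ranks papers as recommendations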

  sortix = np.argsort(-s)
  sortix = sortix[:min(num_recommendations, len(sortix))] # crop paper recommendations to save space
  user_sim[uid] = [strip_version(meta['pids'][ix]) for ix in list(sortix)]

print('writing', Config.user_sim_path)
safe_pickle_dump(user_sim, Config.user_sim_path)
Example #7
    print("%d/%d building an SVM for %s" %
          (ii, len(users), u['username'].encode('utf-8')))
    uid = u['user_id']
    lib = query_db('''select * from library where user_id = ?''', [uid])
    pids = [x['paper_id'] for x in lib]  # raw pids without version
    posix = [xtoi[p] for p in pids if p in xtoi]

    if not posix:
        continue  # empty library for this user maybe?

    print(pids)
    y = np.zeros(X.shape[0])
    for ix in posix:
        y[ix] = 1

    clf = svm.LinearSVC(class_weight='balanced',
                        verbose=False,
                        max_iter=10000,
                        tol=1e-6,
                        C=0.1)
    clf.fit(X, y)
    s = clf.decision_function(X)

    sortix = np.argsort(-s)
    # crop paper recommendations to save space
    sortix = sortix[:min(num_recommendations, len(sortix))]
    user_sim[uid] = [strip_version(meta['pids'][ix]) for ix in list(sortix)]

print('writing', Config.user_sim_path)
safe_pickle_dump(user_sim, Config.user_sim_path)
Example #8
      recompute_index = False
  if recompute_index:
    print('building an index for faster search...')
    for pid in db:
      p = db[pid]
      dict_title = makedict(p['title'], forceidf=5, scale=3)
      dict_authors = makedict(' '.join(x['name'] for x in p['authors']), forceidf=5)
      dict_categories = {x['term'].lower():5 for x in p['tags']}
      if 'and' in dict_authors: 
        # special case for "and" handling in authors list
        del dict_authors['and']
      dict_summary = makedict(p['summary'])
      SEARCH_DICT[pid] = merge_dicts([dict_title, dict_authors, dict_categories, dict_summary])
    # and cache it in file
    print('writing search_dict.p as cache')
    utils.safe_pickle_dump(SEARCH_DICT, 'search_dict.p')
  else:
    print('loading cached index for faster search...')
    SEARCH_DICT = pickle.load(open('search_dict.p', 'rb'))

  # start
  if args.prod:
    # run on Tornado instead, since running raw Flask in prod is not recommended
    print('starting tornado!')
    from tornado import autoreload
    from tornado.wsgi import WSGIContainer
    from tornado.httpserver import HTTPServer
    from tornado.ioloop import IOLoop
    from tornado.log import enable_pretty_logging

    enable_pretty_logging()
Example #9
          "publushed": j["published"],
          "authors": [x["name"] for x in j["authors"]],
          "_version": j["_version"],
          "category": j["arxiv_primary_category"]["term"] # get full category from taxonomy file
        }
        print('Updated %s added %s' % (j['updated'], j['title']))
        num_added += 1
        num_added_total += 1
      else:
        num_skipped += 1

    # print some information
    print('Added %d papers, already had %d.' % (num_added, num_skipped))

    if len(parse.entries) == 0:
      print('Received no results from arxiv. Rate limiting? Exiting. Restart later maybe.')
      print(response)
      break

    if num_added == 0 and args.break_on_no_added == 1:
      print('No new papers were added. Assuming no new papers exist. Exiting.')
      break

    print('Sleeping for %i seconds' % (args.wait_time , ))
    time.sleep(args.wait_time + random.uniform(0, 3))

  # save the database before we quit, if we found anything new
  if num_added_total > 0:
    print('Saving database with %d papers to %s' % (len(db), args.db_path))
    safe_pickle_dump(db, args.db_path)
      if not rawid in meta_db or j['_version'] > meta_db[rawid]['_version']:
        # save a big dictionary j to the database
        meta_db[rawid] = j
        #print(j['tags'])
        print('Updated %s added %s' % (j['updated'].encode('utf-8'), j['title'].encode('utf-8')))
        num_added += 1
        num_added_total += 1
      else:
        num_skipped += 1

    # print some information
    print('Added %d papers, already had %d.' % (num_added, num_skipped))

    if len(parse.entries) == 0:
      print('Received no results from arxiv. Rate limiting? Exiting. Restart later maybe.')
      print(response)
      break

    if num_added == 0 and args.break_on_no_added == 1:
      print('No new papers were added. Assuming no new papers exist. Exiting.')
      break

    print('Sleeping for %i seconds' % (args.wait_time , ))
    time.sleep(args.wait_time + random.uniform(0, 3))

  # save the database before we quit, if we found anything new
  if num_added_total > 0:
    print('Saving database with %d papers to %s' % (len(meta_db), Config.metadata_path))
    safe_pickle_dump(meta_db, Config.metadata_path)

from allennlp.commands.elmo import ElmoEmbedder
import pickle
from utils import Config, safe_pickle_dump
import gensim
elmo = ElmoEmbedder(
    options_file='https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json',
    weight_file='https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
)

db = pickle.load(open(Config.db_path, 'rb'))
summary_tokens = []
for pid, j in db.items():
    # idvv = '%sv%d' % (j['_rawid'], j['_version'])
    summary = j['summary'].replace('\n', ' ')
    summary = gensim.utils.simple_preprocess(summary)
    summary_tokens += summary,
print(len(summary_tokens))
elmo_embed = elmo.embed_batch(summary_tokens)
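# note (hedged): embed_batch returns one numpy array per summary, each roughly of shape
# (3 layers, num_tokens, 1024) for this model, so the pickle written below can get large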
safe_pickle_dump(elmo_embed, 'elmo_embed.p')
Example #12
def save():
    print("Now saving..")
    open("analysis_errors.txt",
         'w').writelines([l + '\n' for l in list(analysis_errors)])
    safe_pickle_dump(sim_dict, Config.sim_path)
def save_db(db):
  print('Saving database with %d papers to %s' % (len(db), Config.db_path))
  safe_pickle_dump(db, Config.db_path)
Example #14
                                       max_train)]  # crop
print("training on %d documents..." % (len(train_txt_paths), ))
train_corpus = make_corpus(train_txt_paths)
v.fit(train_corpus)

# transform
print("transforming %d documents..." % (len(txt_paths), ))
corpus = make_corpus(txt_paths)
X = v.transform(corpus)
print(v.vocabulary_)
print(X.shape)

# write full matrix out
out['X'] = X  # this one is heavy!
print("writing", Config.tfidf_path)
safe_pickle_dump(out, Config.tfidf_path)

# writing lighter metadata information into a separate (smaller) file
out = {}
out['vocab'] = v.vocabulary_
out['idf'] = v._tfidf.idf_
out['pids'] = pids  # a full idvv string (id and version number)
out['ptoi'] = {x: i for i, x in enumerate(pids)}  # pid to ix in X mapping
print("writing", Config.meta_path)
safe_pickle_dump(out, Config.meta_path)

print("Precomputing nearest neighbor queries in batches...")

for i in range(0, len(pids), batch_size):
    print(datetime.datetime.now(tz))
    i1 = min(len(pids), i + batch_size)
        time_posted = (d - epochd).total_seconds()
        seen[r.id]['time_posted'] = time_posted

    print('processed %d/%d new tweets. Currently maintaining total %d' %
          (num_processed, len(results), len(seen)))

    # maintain state: if something was seen > few days ago, forget it
    maxdt = 60 * 60 * 24 * max_days_keep
    seen_new = {
        tweetid: d
        for tweetid, d in seen.items() if tnow - d['time_posted'] < maxdt
    }
    print('previous seen dict had %d tweets, pruning to %d' %
          (len(seen), len(seen_new)))
    seen = seen_new  # swap

    # compile all votes and write output for serving
    votes = {}
    for tweetid, d in seen.items():
        for pid in d['pids']:
            votes[pid] = votes.get(pid, 0) + 1
    votes = [(v, k) for k, v in votes.items()]
    votes.sort(reverse=True, key=lambda x: x[0])  # descending
    print('top votes', votes[:min(len(votes), 10)])
    print('writing', Config.tweet_path)
    safe_pickle_dump(votes, Config.tweet_path)

    # and sleep for a while
    print('sleeping', sleep_time)
    time.sleep(sleep_time)
Example #16
for ii, u in enumerate(users):
    print('%d/%d building an SVM for %s' %
          (ii, len(users), u['username'].encode('utf-8')))
    uid = u['user_id']
    lib = query_db('''select * from library where user_id = ?''', [uid])
    pids = [x['paper_id'] for x in lib]  # raw pids without version
    posix = [xtoi[p] for p in pids if p in xtoi]

    if not posix:
        continue  # empty library for this user maybe?

    print(pids)
    y = np.zeros(X.shape[0])
    for ix in posix:
        y[ix] = 1

    #__init__(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000)[source]
    clf = svm.LinearSVC(class_weight='balanced',
                        verbose=True,
                        max_iter=10000,
                        tol=1e-6,
                        C=1)
    clf.fit(X, y)
    s = clf.decision_function(X)

    sortix = np.argsort(-s)
    user_sim[uid] = [strip_version(meta['pids'][ix]) for ix in list(sortix)]

print('writing user_sim.p')
utils.safe_pickle_dump(user_sim, "user_sim.p")
Example #17
                    token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b',
                    ngram_range=(1, 2),
                    max_features=10000,
                    norm='l2',
                    use_idf=True,
                    smooth_idf=True,
                    sublinear_tf=True,
                    max_df=1.0,
                    min_df=1)

print("training on %d documents..." % (len(txt_list)))
v.fit(txt_list)

print("transforming %d documents..." % (len(txt_list), ))
X = v.transform(txt_list)
print("shape of matrix:", X.shape)

# write full matrix out
out = {}
out['X'] = X  # this one is heavy!
print("writing", utils.Config.tfidf_path)
utils.safe_pickle_dump(out, utils.Config.tfidf_path)

# writing lighter metadata information into a separate (smaller) file
out = {}
out['vocab'] = v.vocabulary_
out['idf'] = v._tfidf.idf_
out['fp'] = filepaths
print("writing", utils.Config.meta_path)
utils.safe_pickle_dump(out, utils.Config.meta_path)
Example #18
def fetch(args):
    base_url = 'http://export.arxiv.org/api/query?'

    db = pickle_load(args.db_path)
    print('database has {} entries at start'.format(len(db)))

    if args.date_sort_by == 's':
        sort_by = 'submittedDate'
    elif args.date_sort_by == 'u':
        sort_by = 'lastUpdatedDate'
    else:
        print('[Warning] --date-sort-by changed to "lastUpdatedDate"')
        sort_by = 'lastUpdatedDate'

    assert args.max_index - args.start_index > 0, 'error index range from {f} to {t}'.format(
        f=args.start_index, t=args.max_index)
    num_iter = min(args.max_index - args.start_index,
                   args.results_per_iteration)
    num_added_total = 0

    for i in range(args.start_index, args.max_index,
                   args.results_per_iteration):
        if args.id_list == 'none':
            print('Result {} - {}'.format(i, i + num_iter))
            query = 'search_query={q}&sortBy={ds}&start={s}&max_results={m}'.format(
                q=args.search_query, ds=sort_by, s=i, m=num_iter)
        else:
            query = 'id_list={}'.format(args.id_list)

        with urllib.request.urlopen(base_url + query) as url:
            resp = url.read()
        parse = feedparser.parse(resp)
        num_added = 0

        for e in parse.entries:
            j = encode_feedparser_dict(e)

            rawid, version = parse_arxiv_url(j['id'])
            j['_rawid'] = rawid
            j['_version'] = version

            db, cnt = compare_db(db, j)

            if cnt:
                num_added += 1
                num_added_total += 1

        if len(parse.entries) == 0:
            print('Received no results from arXiv.')
            print(resp)
            if args.break_on_no_added == 0:
                pass
            else:
                break

        if num_added == 0:
            print('No more new papers.')

        if args.id_list != 'none':
            break

        print('Sleeping for {} seconds'.format(args.wait_time))
        time.sleep(args.wait_time + random.uniform(0, 3))

    if num_added_total > 0:
        print('Saving database with {n} papers to {p}'.format(n=len(db),
                                                              p=args.db_path))
        safe_pickle_dump(db, args.db_path)
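
# A rough invocation sketch (the argument names are inferred from the attributes used above,
# so treat them as assumptions rather than the script's real CLI):
#
#   from argparse import Namespace
#   args = Namespace(db_path='db.p', search_query='cat:cs.CL', id_list='none',
#                    date_sort_by='u', start_index=0, max_index=1000,
#                    results_per_iteration=100, wait_time=5, break_on_no_added=1)
#   fetch(args)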
    max_features=20000,
    norm="l2",
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
)

X = v.fit_transform(txts)
print(v.vocabulary_)
print(X.shape)

# write full matrix out
out = {}
out["X"] = X  # this one is heavy!
print ("writing tfidf.p")
utils.safe_pickle_dump(out, "tfidf.p")

# writing lighter metadata information into a separate (smaller) file
out = {}
out["vocab"] = v.vocabulary_
out["idf"] = v._tfidf.idf_
out["pids"] = pids  # a full idvv string (id and version number)
out["ptoi"] = {x: i for i, x in enumerate(pids)}  # pid to ix in X mapping
print ("writing tfidf_meta.p")
utils.safe_pickle_dump(out, "tfidf_meta.p")

print "precomputing nearest neighbor queries in batches..."
X = X.todense()  # originally it's a sparse matrix
sim_dict = {}
batch_size = 200
for i in xrange(0, len(pids), batch_size):
Example #20
def analyze(csv_file, txt_dir):
    db = read_csv(csv_file)

    # read all text files for all papers into memory
    txt_paths, pids = [], []
    n = 0
    for idvv in db:
        n += 1
        # idvv = '%sv%d' % (j['_rawid'], j['_version'])
        # merged text_path
        txt_path = os.path.join(txt_dir, idvv + '.txt')
        if os.path.isfile(txt_path):  # some pdfs dont translate to txt
            with open(txt_path, 'r') as f:
                txt = f.read()
            if len(txt) > 1000 and len(
                    txt) < 500000:  # 500K is VERY conservative upper bound
                txt_paths.append(
                    txt_path
                )  # todo later: maybe filter or something some of them
                pids.append(idvv)
                print("read %d/%d (%s) with %d chars" %
                      (n, len(db), idvv, len(txt)))
            else:
                print("skipped %d/%d (%s) with %d chars: suspicious!" %
                      (n, len(db), idvv, len(txt)))
        else:
            print("could not find %s in txt folder." % (txt_path, ))
    print("in total read in %d text files out of %d db entries." %
          (len(txt_paths), len(db)))

    # compute tfidf vectors with scikits
    v = TfidfVectorizer(input='content',
                        encoding='utf-8',
                        decode_error='replace',
                        strip_accents='unicode',
                        lowercase=True,
                        analyzer='word',
                        stop_words='english',
                        token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b',
                        ngram_range=(1, 2),
                        max_features=max_features,
                        norm='l2',
                        use_idf=True,
                        smooth_idf=True,
                        sublinear_tf=True,
                        max_df=1.0,
                        min_df=1)

    # create an iterator object to conserve memory
    def make_corpus(paths):
        for p in paths:
            with open(p, 'r') as f:
                txt = f.read()
            yield txt

    # train
    train_txt_paths = list(txt_paths)  # duplicate
    shuffle(train_txt_paths)  # shuffle
    train_txt_paths = train_txt_paths[:min(len(train_txt_paths),
                                           max_train)]  # crop
    print("training on %d documents..." % (len(train_txt_paths), ))
    train_corpus = make_corpus(train_txt_paths)
    v.fit(train_corpus)

    # transform
    print("transforming %d documents..." % (len(txt_paths), ))
    corpus = make_corpus(txt_paths)
    X = v.transform(corpus)
    print(v.vocabulary_)
    print(X.shape)

    # write full matrix out
    out = {}
    out['X'] = X  # this one is heavy!
    print("writing", Config.tfidf_path)
    safe_pickle_dump(out, Config.tfidf_path)

    # writing lighter metadata information into a separate (smaller) file
    out = {}
    out['vocab'] = v.vocabulary_
    out['idf'] = v._tfidf.idf_
    out['pids'] = pids  # a full idvv string (id and version number)
    out['ptoi'] = {x: i for i, x in enumerate(pids)}  # pid to ix in X mapping
    print("writing", Config.meta_path)
    safe_pickle_dump(out, Config.meta_path)

    print("precomputing nearest neighbor queries in batches...")
    X = X.todense()  # originally it's a sparse matrix
    sim_dict = {}
    batch_size = 200
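    # process the queries in blocks of batch_size so the dense N x B similarity slab stays small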
    for i in range(0, len(pids), batch_size):
        i1 = min(len(pids), i + batch_size)
        xquery = X[i:i1]  # BxD
        ds = -np.asarray(np.dot(X, xquery.T))  #NxD * DxB => NxB
        IX = np.argsort(ds, axis=0)  # NxB
        for j in range(i1 - i):
            sim_dict[pids[i + j]] = [pids[q] for q in list(IX[:50, j])]
        print('%d/%d...' % (i, len(pids)))

    print("writing", Config.sim_path)
    # safe_pickle_dump(sim_dict, Config.sim_path)
    write_json(os.path.join(txt_dir, 'sim_dict.json'), sim_dict)
# oom killer was here
v.fit(train_corpus)

# transform
print("transforming %d documents..." % (len(txt_paths),))
corpus = make_corpus(txt_paths)
print("created corpus")
X = v.transform(corpus)
# print(v.vocabulary_)
print(X.shape)

# write full matrix out
out = {}
out["X"] = X  # this one is heavy!
print("writing", Config.tfidf_path)
safe_pickle_dump(out, Config.tfidf_path)

# writing lighter metadata information into a separate (smaller) file
out = {}
out["vocab"] = v.vocabulary_
out["idf"] = v._tfidf.idf_
out["pids"] = pids  # a full idvv string (id and version number)
out["ptoi"] = {x: i for i, x in enumerate(pids)}  # pid to ix in X mapping
print("writing", Config.meta_path)
safe_pickle_dump(out, Config.meta_path)

print("precomputing nearest neighbor queries in batches...")
X = X.todense()  # originally it's a sparse matrix
sim_dict = {}
batch_size = 100
for i in range(0, len(pids), batch_size):
Example #22
def parse_xml(response):

    # lets load the existing database to memory
    try:
        db = pickle.load(open(Config.db_path, 'rb'))
    except Exception as e:
        print('error loading existing database:')
        print(e)
        print('starting from an empty database')
        db = {}

    print('database has %d entries at start' % (len(db), ))

    OAI = "{http://www.openarchives.org/OAI/2.0/}"
    ARXIV = "{http://arxiv.org/OAI/arXivRaw/}"

    parse = objectify.parse(response)

    num_added = 0
    num_skipped = 0

    root = parse.getroot()
    record_list = root.find(OAI + 'ListRecords').findall(OAI + "record")

    for record in record_list:
        info = record.metadata.find(ARXIV + "arXivRaw")

        versions = info.findall(ARXIV + "version")
        version_num = len(versions)

        published_version = info.find(ARXIV + "version[@version='v1']")
        latest_version = info.find(ARXIV + "version[@version='v" +
                                   str(version_num) + "']")

        published_parsed = dateutil.parser.parse(published_version.date.text)
        updated_parsed = dateutil.parser.parse(latest_version.date.text)
        published = published_parsed.strftime('%Y-%m-%d')
        updated = updated_parsed.strftime('%Y-%m-%d')

        authors = []
        author_list = info.authors.text.replace(', and ', ', ').replace(' and ', ', ')
        author_list = author_list.split(', ')
        for author in author_list:
            authors.append({'name': author})

        cats = info.categories.text.split()
        primary_cat = {'term': cats[0]}
        tags = []
        for cat in cats:
            tags.append({'term': cat})

        rawid = info.id.text

        id_url = 'http://arxiv.org/abs/' + info.id.text

        if hasattr(info, 'doi'):
            doi = info.doi.text
        else:
            doi = ''

        if hasattr(info, 'journal-ref'):
            journal = info.find(ARXIV + 'journal-ref').text
        else:
            journal = ''

        if hasattr(info, 'comments'):
            comment = info.find(ARXIV + 'comments').text
        else:
            comment = ''

        links = [{
            'href':
            'http://arxiv.org/abs/' + rawid + 'v' + str(version_num),
            'rel':
            'alternate',
            'type':
            'text/html'
        }, {
            'href':
            'http://arxiv.org/pdf/' + rawid + 'v' + str(version_num),
            'rel':
            'related',
            'title':
            'pdf',
            'type':
            'application/pdf'
        }]

        j = {
            'published': published,
            'updated': updated,
            'updated_parsed': updated_parsed,
            'published_parsed': published_parsed,
            'authors': authors,
            'tags': tags,
            'arxiv_primary_category': primary_cat,
            'arxiv_doi': doi,
            'arxiv_journal_ref': journal,
            'id': id_url,
            'link': id_url,
            'links': links,
            '_rawid': rawid,
            '_version': version_num,
            'title': info.title.text,
            'summary': info.abstract.text,
            'arxiv_comment': comment,
        }

        # add to our database if we didn't have it before, or if this is a new version
        if not rawid in db or j['_version'] > db[rawid]['_version']:
            db[rawid] = j
            print('Updated %s added %s' %
                  (j['updated'].encode('utf-8'), j['title'].encode('utf-8')))
            num_added += 1
        else:
            num_skipped += 1

    # print some information
    print('Added %d papers, already had %d.' % (num_added, num_skipped))

    # save the database before we quit, if we found anything new
    print('Saving database with %d papers to %s' % (len(db), Config.db_path))
    safe_pickle_dump(db, Config.db_path)

    return
                num_skipped += 1

        #-------------------------- end of the main fetch loop; print statistics (Step 5) --------------------------

        # print some information
        print('Added %d papers, already had %d.' % (num_added, num_skipped))

        # when arxiv rate-limits us, could we wait for a while and then continue instead?
        '''
    if len(parse.entries) == 0:
      # the original program simply terminated here
      print('Received no results from arxiv. Rate limiting? Exiting. Restart later maybe.')
      print(response)
      break
     '''

        if num_added == 0 and args.break_on_no_added == 1:
            print(
                'No new papers were added. Assuming no new papers exist. Exiting.'
            )
            break

        print('Sleeping for %i seconds' % (args.wait_time, ))
        time.sleep(args.wait_time + random.uniform(0, 3))

    # save the database before we quit, if we found anything new
    if num_added_total > 0:
        print('Saving database with %d papers to %s' %
              (len(db), Config.db_path))
        safe_pickle_dump(db, Config.db_path)  # write the database out to external storage
Example #24
    txt_path = os.path.join('data', 'txt', idvv) + '.pdf.txt'
    if os.path.isfile(txt_path): # some pdfs dont translate to txt
        with open(txt_path, 'r') as f:
            txt = f.read()
            if len(txt) > 1000 and len(txt) < 500000: # 500K is VERY conservative upper bound
                txt_paths.append(txt_path) # todo later: maybe filter or something some of them
                pids.append(idvv)
                print("read %d/%d (%s) with %d chars" % (n, len(db), idvv, len(txt)))
                trainedpapers[pid] = j
            else:
                print("skipped %d/%d (%s) with %d chars: suspicious!" % (n, len(db), idvv, len(txt)))
    else:
        print("could not find %s in txt folder." % (txt_path, ))
print("in total read in %d text files out of %d db entries." % (len(txt_paths), len(db)))
print("writing ", Config.trained_path)
safe_pickle_dump(trainedpapers, Config.trained_path)

# compute tfidf vectors with scikits
vectorizer = TfidfVectorizer(input='content', 
        encoding='utf-8', decode_error='replace', strip_accents='unicode', 
        lowercase=True, analyzer='word', stop_words='english', 
        token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b', 
        norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True,
        max_df=1.0, min_df=1)

# build an SVD model; n_components = 100 is chosen arbitrarily
svd_model = TruncatedSVD(n_components=100, 
                         algorithm='randomized',
                         n_iter=10, random_state=42)
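
# A hypothetical way to chain the two into an LSA pipeline (an assumption, not in the
# original snippet):
#   from sklearn.pipeline import make_pipeline
#   lsa = make_pipeline(vectorizer, svd_model)
#   X_lsa = lsa.fit_transform(texts)  # texts: iterable of document strings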

# create an iterator object to conserve memory
def main():
    # parser = argparse.ArgumentParser()

    # ## Required parameters
    # parser.add_argument("--input_file", default=None, type=str, required=True)
    # parser.add_argument("--output_file", default=None, type=str, required=True)
    # parser.add_argument("--bert_model", default=None, type=str, required=True,
    #                     help="Bert pre-trained model selected in the list: bert-base-uncased, "
    #                          "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")

    # ## Other parameters
    # parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
    # parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
    # parser.add_argument("--max_seq_length", default=128, type=int,
    #                     help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
    #                         "than this will be truncated, and sequences shorter than this will be padded.")
    # parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
    # parser.add_argument("--local_rank",
    #                     type=int,
    #                     default=-1,
    #                     help = "local_rank for distributed training on gpus")
    # parser.add_argument("--no_cuda",
    #                     action='store_true',
    #                     help="Whether not to use CUDA when available")

    # args = parser.parse_args()
    local_rank = -1
    # TODO: change this
    no_cuda = True
    # layers = "-1,-2,-3,-4"
    # layers = "-1,-2,-3,-4"
    bert_model = 'bert-base-uncased'
    do_lower_case = True
    max_seq_length = 150
    batch_size = 32
    feature_size = 768

    if local_rank == -1 or no_cuda:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {} distributed training: {}".format(
        device, n_gpu, bool(local_rank != -1)))

    # layer_indexes = [int(x) for x in layers.split(",")]

    tokenizer = BertTokenizer.from_pretrained(bert_model,
                                              do_lower_case=do_lower_case)

    model = BertModel.from_pretrained(bert_model)
    model.to(device)

    if local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)
    examples = []  # 2100+ papers
    for pid, j in db.items():
        # idvv = '%sv%d' % (j['_rawid'], j['_version'])
        summary = j['summary'].replace('\n', ' ')
        examples += InputExample(pid, summary),

    features = convert_examples_to_features(examples=examples,
                                            seq_length=max_seq_length,
                                            tokenizer=tokenizer)

    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    if local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=batch_size)

    model.eval()
    layer_index = -2
    bert_out = collections.OrderedDict()
    for input_ids, input_mask, example_indices in eval_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)

        all_encoder_layers, _ = model(input_ids,
                                      token_type_ids=None,
                                      attention_mask=input_mask)

        for b, example_index in enumerate(example_indices):
            print("Example_index: ", example_index)
            feature = features[example_index.item()]
            unique_id = feature.unique_id
            feature = unique_id_to_feature[unique_id]
            output_json = collections.OrderedDict()
            output_json["linex_index"] = unique_id
            layer_output = all_encoder_layers[int(
                layer_index)].detach().cpu().numpy()
            layer_output = layer_output[b]  # max_seq_length (150) x 768
            # average pooling
            # sentence_embedding = np.mean(layer_output, 0)
            # sentence_embedding = [
            #     round(x.item(), 3) for x in sentence_embedding
            # ]
            print(unique_id, "bert out")
            bert_out[unique_id] = np.round(layer_output.flatten(), 3)
    safe_pickle_dump(bert_out, 'bert_out_big.p')
Example #26
      # extract just the raw arxiv id and version for this paper
      rawid, version = parse_arxiv_url(j['id'])
      j['_rawid'] = rawid
      j['_version'] = version

      # add to our database if we didn't have it before, or if this is a new version
      if not rawid in db or j['_version'] > db[rawid]['_version']:
        db[rawid] = j
        print('updated %s added %s' % (j['updated'].encode('utf-8'), j['title'].encode('utf-8')))
        num_added += 1
      else:
        num_skipped += 1

    # print some information
    print('Added %d papers, already had %d.' % (num_added, num_skipped))

    if len(parse.entries) == 0:
      print('Received no results from arxiv. Rate limiting? Exiting. Restart later maybe.')
      print(response)
      break

    if num_added == 0 and args.break_on_no_added == 1:
      print('No new papers were added. Assuming no new papers exist. Exiting.')
      break

    print('Sleeping for %i seconds' % (args.wait_time , ))
    time.sleep(args.wait_time + random.uniform(0, 3))
  # save the database before we quit
  print('saving database with %d papers to %s' % (len(db), args.db_path))
  utils.safe_pickle_dump(db, args.db_path)
        for entry in map(encode_feedparser_dict, feed.entries):
            # extract just the raw arxiv id and version for this paper
            rawid, version = parse_arxiv_url(entry["id"])
            entry["_rawid"], entry["_version"] = rawid, version

            # add to our database if we didn't have it before, or if this is a new version
            if rawid not in paper_db or version > paper_db[rawid]["_version"]:
                paper_db[rawid] = entry
                print(f"Updated {entry['updated']} added {entry['title']}")
                num_added += 1
            else:
                num_skipped += 1

        # print some information
        print("Added %d papers, already had %d." % (num_added, num_skipped))
        num_added_total += num_added

        if num_added == 0 and args.break_on_no_added == 1:
            print("No new papers were added. Assuming "
                  "no new papers exist. Exiting.")
            break

        print(f"Sleeping for {args.wait_time} seconds")
        time.sleep(args.wait_time + random.uniform(0, 3))

    # save the database before we quit, if we found anything new
    if num_added_total > 0:
        print(
            f"Saving database with {len(paper_db)} papers to {Config.db_path}")
        safe_pickle_dump(paper_db, Config.db_path)
X = X.todense()

xtoi = { strip_version(x):i for x,i in meta['ptoi'].items() }

user_sim = {}
for ii,u in enumerate(users):
  print('%d/%d building an SVM for %s' % (ii, len(users), u['username'].encode('utf-8')))
  uid = u['user_id']
  lib = query_db('''select * from library where user_id = ?''', [uid])
  pids = [x['paper_id'] for x in lib] # raw pids without version
  posix = [xtoi[p] for p in pids if p in xtoi]
  
  if not posix:
    continue # empty library for this user maybe?

  print(pids)
  y = np.zeros(X.shape[0])
  for ix in posix:
    y[ix] = 1

  #__init__(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000)[source]
  clf = svm.LinearSVC(class_weight='balanced', verbose=True, max_iter=10000, tol=1e-6, C=1)
  clf.fit(X,y)
  s = clf.decision_function(X)

  sortix = np.argsort(-s)
  user_sim[uid] = [strip_version(meta['pids'][ix]) for ix in list(sortix)]

print('writing user_sim.p')
utils.safe_pickle_dump(user_sim, "user_sim.p")
Example #29
            j['_rawid'] = rawid
            j['_version'] = version

            # add to our database if we didn't have it before, or if this is a new version
            if not rawid in db or j['_version'] > db[rawid]['_version']:
                db[rawid] = j
                print('updated %s added %s' % (j['updated'].encode('utf-8'),
                                               j['title'].encode('utf-8')))
                num_added += 1
            else:
                num_skipped += 1

        # print some information
        print('Added %d papers, already had %d.' % (num_added, num_skipped))

        if len(parse.entries) == 0:
            print('Received no results from arxiv. Rate limiting? Exiting. Restart later maybe.')
            print(response)
            break

        if num_added == 0 and args.break_on_no_added == 1:
            print('No new papers were added. Assuming no new papers exist. Exiting.')
            break

        print('Sleeping for %i seconds' % (args.wait_time, ))
        time.sleep(args.wait_time + random.uniform(0, 3))

    # save the database before we quit
    print('saving database with %d papers to %s' % (len(db), args.db_path))
    utils.safe_pickle_dump(db, args.db_path)
      recompute_index = False
  if recompute_index:
    print('building an index for faster search...')
    for pid in db:
      p = db[pid]
      dict_title = makedict(p['title'], forceidf=5, scale=3)
      dict_authors = makedict(' '.join(x['name'] for x in p['authors']), forceidf=5)
      dict_categories = {x['term'].lower():5 for x in p['tags']}
      if 'and' in dict_authors: 
        # special case for "and" handling in authors list
        del dict_authors['and']
      dict_summary = makedict(p['summary'])
      SEARCH_DICT[pid] = merge_dicts([dict_title, dict_authors, dict_categories, dict_summary])
    # and cache it in file
    print('writing search_dict.p as cache')
    utils.safe_pickle_dump(SEARCH_DICT, 'search_dict.p')
  else:
    print('loading cached index for faster search...')
    SEARCH_DICT = pickle.load(open('search_dict.p', 'rb'))

  # start
  if args.prod:
    # run on Tornado instead, since running raw Flask in prod is not recommended
    print('starting tornado!')
    from tornado.wsgi import WSGIContainer
    from tornado.httpserver import HTTPServer
    from tornado.ioloop import IOLoop
    from tornado.log import enable_pretty_logging
    enable_pretty_logging()
    http_server = HTTPServer(WSGIContainer(app))
    http_server.listen(args.port)
print(f"training on {n_train} documents...")

# duplicate, shuffle, split and train, then transform
train_txt_paths = list(txt_paths)
shuffle(train_txt_paths)
v.fit(train_txt_paths[:n_train])

print(f"transforming {len(txt_paths)} documents...")
X = v.transform(txt_paths)

print(v.vocabulary_)
print(X.shape)

# write full matrix out, this one is heavy!
print("writing", Config.tfidf_path)
safe_pickle_dump({"X": X}, Config.tfidf_path)

# writing lighter metadata information into a separate (smaller) file
print("writing", Config.meta_path)
safe_pickle_dump(
    {
        'vocab': v.vocabulary_,
        'idf': v._tfidf.idf_,
        'pids': pids,  # a full idvv string (id and version number)
        'ptoi': {x: i
                 for i, x in enumerate(pids)}  # pid to ix in X mapping
    },
    Config.meta_path)

print("precomputing nearest neighbor queries in batches...")
X = X.todense()  # originally it's a sparse matrix
def dump_rawid(rawid,dumpdic):
    db = pickle.load(open(Config.db_path, 'rb'))
    if rawid not in db or dumpdic['_version'] > db[rawid]['_version']:
        db[rawid] = dumpdic
        safe_pickle_dump(db, Config.db_path)
print("training on %d documents..." % (len(train_txt_paths), ))
train_corpus = make_corpus(train_txt_paths)
v.fit(train_corpus)

# transform
print("transforming %d documents..." % (len(txt_paths), ))
corpus = make_corpus(txt_paths)
X = v.transform(corpus)
pprint(v.vocabulary_)
print(X.shape)

# write full matrix out
out = {}
out['X'] = X  # this one is heavy!
print("writing", Config.tfidf_path)
safe_pickle_dump(out, Config.tfidf_path)

# writing lighter metadata information into a separate (smaller) file
out = {}
out['vocab'] = v.vocabulary_
out['idf'] = v._tfidf.idf_
out['pids'] = pids  # a full idvv string (id and version number)
out['ptoi'] = {x: i for i, x in enumerate(pids)}  # pid to ix in X mapping
print("writing", Config.meta_path)
safe_pickle_dump(out, Config.meta_path)

print("precomputing nearest neighbor queries in batches...")
X = X.todense()  # originally it's a sparse matrix
sim_dict = {}
batch_size = 200
for i in range(0, len(pids), batch_size):
        print('building an index for faster search...')
        for pid in db:
            p = db[pid]
            dict_title = makedict(p['title'], forceidf=5, scale=3)
            dict_authors = makedict(' '.join(x['name'] for x in p['authors']),
                                    forceidf=5)
            dict_categories = {x['term'].lower(): 5 for x in p['tags']}
            if 'and' in dict_authors:
                # special case for "and" handling in authors list
                del dict_authors['and']
            dict_summary = makedict(p['summary'])
            SEARCH_DICT[pid] = merge_dicts(
                [dict_title, dict_authors, dict_categories, dict_summary])
        # and cache it in file
        print('writing ', Config.search_dict_path, ' as cache...')
        safe_pickle_dump(SEARCH_DICT, Config.search_dict_path)
    else:
        print('loading cached index for faster search from',
              Config.search_dict_path)
        SEARCH_DICT = pickle.load(open(Config.search_dict_path, 'rb'))

    # start
    if args.prod:
        # run on Tornado instead, since running raw Flask in prod is not recommended
        print('starting tornado!')
        from tornado.wsgi import WSGIContainer
        from tornado.httpserver import HTTPServer
        from tornado.ioloop import IOLoop
        from tornado.log import enable_pretty_logging
        enable_pretty_logging()
        http_server = HTTPServer(WSGIContainer(app))
      # add to our database if we didn't have it before, or if this is a new version
      if not rawid in db or j['_version'] > db[rawid]['_version']:
        db[rawid] = j
        print('Updated %s added %s' % (j['updated'].encode('utf-8'), j['title'].encode('utf-8')))
        num_added += 1
        num_added_total += 1
      else:
        num_skipped += 1

    # print some information
    print('Added %d papers, already had %d.' % (num_added, num_skipped))

    if len(parse.entries) == 0:
      print('Received no results from arxiv. Rate limiting? Exiting. Restart later maybe.')
      print(response)
      break

    if num_added == 0 and args.break_on_no_added == 1:
      print('No new papers were added. Assuming no new papers exist. Exiting.')
      break

    print('Sleeping for %i seconds' % (args.wait_time , ))
    time.sleep(args.wait_time + random.uniform(0, 3))

  # save the database before we quit, if we found anything new
  if num_added_total > 0:
    print('Saving database with %d papers to %s' % (len(db), Config.db_path))
    safe_pickle_dump(db, Config.db_path)

                  (n, len(db), idvv, len(txt)))
    else:
        print("could not find %s in txt folder." % (txt_path, ))
print("in total read in %d text files out of %d db entries." %
      (len(txt_paths), len(db)))

print("precomputing nearest neighbor queries in batches...")
# X = X.todense() # originally it's a sparse matrix
sim_dict = {}
# batch_size = 200
# for i in range(0,len(pids),batch_size):
#   i1 = min(len(pids), i+batch_size)
#   xquery = X[i:i1] # BxD
#   ds = -np.asarray(np.dot(X, xquery.T)) #NxD * DxB => NxB
#   IX = np.argsort(ds, axis=0) # NxB
#   for j in range(i1-i):
#     sim_dict[pids[i+j]] = [pids[q] for q in list(IX[:50,j])]
#   print('%d/%d...' % (i, len(pids)))

model = Doc2Vec.load("d2v.model")
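# docvecs.most_similar(pid) returns (tag, cosine similarity) pairs for the nearest document
# vectors, so sim_dict maps each pid to its closest papers under the Doc2Vec model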
for pid in pids:
    try:
        tmp = model.docvecs.most_similar(pid)
    except KeyError:  # pid was not seen during Doc2Vec training
        tmp = []
    sim_dict[pid] = [sim_pid for sim_pid, distance in tmp]

print("writing", Config.sim_path)
safe_pickle_dump(sim_dict, Config.sim_path)