def import_date(fp_date, db):
    """Import per-document date distributions into the ``date`` collection.

    ``fp_date`` is a tab-separated text file with one
    ``doc_id<TAB>date<TAB>tf`` record per line.  All records of a document
    are assumed to be adjacent: a change of ``doc_id`` is what triggers the
    write of the previous document.

    For every document the term frequencies are summed per
    ``date2daterange(int(date))`` bucket, passed through ``freq2prob`` and
    stored with ``$set`` under the ``distribution`` field of the document
    whose ``_id`` is the doc id.  ``update`` is called without ``upsert``.

    Blank lines are skipped, and nothing is written when the file contains
    no records.

    Args:
        fp_date: path of the tab-separated date frequency file.
        db: database handle exposing a ``date`` collection with ``update``.

    Raises:
        ValueError: if a non-blank line does not have exactly three
            tab-separated fields, or ``date``/``tf`` are not numeric.
    """
    def _store(doc_id, tfdict):
        # Original author's note: to successfully update, use unicode.
        db.date.update({u"_id": unicode(doc_id)},
                       {'$set': {"distribution": freq2prob(tfdict)}})

    # assume that 'date' collection is small enough to put in memory
    old_key = None  # doc_id of the document currently being accumulated
    tfdict = defaultdict(float)  # date-range -> summed tf for that document
    with open(fp_date) as fin:
        for line in fin:
            line = line.strip('\n')
            # Lines yielded by a file always carry their newline, so the
            # emptiness check has to happen after stripping it.
            if not line.strip():
                continue
            this_key, date, tf = line.split('\t')
            if this_key != old_key and old_key:
                _store(old_key, tfdict)
                tfdict = defaultdict(float)
            old_key = this_key
            # update date tf
            tfdict[date2daterange(int(date))] += float(tf)

    # Don't forget the last document -- but only if at least one was read.
    if old_key is not None:
        _store(old_key, tfdict)
    # Single-argument parenthesised form prints the same text under the
    # Python 2 print statement.
    print("Finish importing date distributions.")
def import2mongo(filepath, batch_size=2000):
    """Import n-gram term/document frequencies into the local ``HTRC`` database.

    ``filepath`` is a UTF-8, tab-separated file with one
    ``doc_id<TAB>term<TAB>tf`` record per line.  All records of a document
    are assumed to be adjacent: a change of ``doc_id`` closes the previous
    document.

    A term's n-gram order is the number of spaces it contains (0, 1 or 2);
    terms with more spaces are ignored.  Per document, one entry is written
    to each of ``tf_1``, ``tf_2``, ``tf_3`` (term frequencies per order) and
    ``tf_ocr`` (character frequencies, built from unigrams only, weighted by
    the unigram's tf).  Each entry has the form
    ``{"_id": doc_id, "freq": ..., "prob": freq2prob(freq)}``.  After all
    documents are read, the per-order document frequencies are passed
    through ``reshape`` and inserted into ``df_1``, ``df_2``, ``df_3``.

    Any of those seven collections that already exists is dropped first.
    Blank lines are skipped; if the file holds no records nothing is
    inserted.

    Args:
        filepath: path of the tab-separated frequency file.
        batch_size: number of documents buffered before a bulk insert, to
            bound memory use.

    Raises:
        ValueError: if a non-blank line does not have exactly three
            tab-separated fields, or ``tf`` is not numeric.
    """
    tf_names = ('tf_1', 'tf_2', 'tf_3')  # indexed by n-gram order - 1
    client = MongoClient('localhost', 27017)
    try:
        db = client.HTRC
        collections = db.collection_names()
        for c in ['tf_1', 'tf_2', 'tf_3', 'df_1', 'df_2', 'df_3', 'tf_ocr']:
            if c in collections:
                print("Collection %s already exists in 'HTRC' database. Drop it." % c)
                db.drop_collection(c)

        # document frequencies of terms, indexed by number of spaces in term
        dfdicts = (defaultdict(float), defaultdict(float), defaultdict(float))
        # documents waiting for the next bulk insert, per collection
        pending = {'tf_1': [], 'tf_2': [], 'tf_3': [], 'tf_ocr': []}

        def make_doc(doc_id, freq):
            return {"_id": doc_id, "freq": freq, "prob": freq2prob(freq)}

        def queue_document(doc_id, tfdicts, chardict):
            # Buffer the four per-document entries of one finished document.
            for name, freq in zip(tf_names, tfdicts):
                pending[name].append(make_doc(doc_id, freq))
            pending['tf_ocr'].append(make_doc(doc_id, chardict))

        def flush_pending():
            # Bulk insert and release whatever is buffered.
            for name in tf_names + ('tf_ocr',):
                if pending[name]:
                    db[name].insert(pending[name])
                    pending[name] = []

        old_key = None  # doc_id of the document currently being accumulated
        tfdicts = ({}, {}, {})  # term frequency dictionaries of that document
        chardict = defaultdict(float)
        with codecs.open(filepath, encoding='utf8') as fin:
            for line in fin:
                line = line.rstrip('\n')
                # Iterated lines keep their newline, so test for emptiness
                # only after removing it.
                if not line.strip():
                    continue
                this_key, term, tf = line.split('\t')
                # BSON doesn't allow '.' and '$'
                term = term.replace('.', '-').replace('$', '-')
                if this_key != old_key and old_key:
                    queue_document(old_key, tfdicts, chardict)
                    tfdicts = ({}, {}, {})
                    chardict = defaultdict(float)
                    if len(pending['tf_1']) >= batch_size:
                        flush_pending()
                old_key = this_key

                # update tf & df
                order = term.count(' ')
                if order > 2:
                    continue  # only uni-, bi- and trigrams are stored
                freq = float(tf)
                tfdict = tfdicts[order]
                if term in tfdict:
                    # Two raw terms collapsed onto the same key by the
                    # '.'/'$' replacement: merge their counts instead of
                    # overwriting, and do not count the document twice.
                    tfdict[term] += freq
                else:
                    tfdict[term] = freq
                    dfdicts[order][term] += 1
                if order == 0:
                    for char in term:
                        chardict[char] += freq

        if old_key is None:
            return  # no records: nothing to insert

        # don't forget last doc; insert regardless of batch fill level
        queue_document(old_key, tfdicts, chardict)
        flush_pending()

        # save df (document frequencies) to collections ('df_1','df_2','df_3')
        db.df_1.insert(reshape(dfdicts[0]))
        db.df_2.insert(reshape(dfdicts[1]))
        db.df_3.insert(reshape(dfdicts[2]))
    finally:
        client.close()