def generate_corpus_file(pmids, medline_dir, fn):
    """Write a tab-separated corpus file of processed abstracts.

    The file starts with the header line ``## pmid<TAB>stop_stem`` followed
    by one ``pmid<TAB>abstract`` line for every pair yielded by
    ``medline.medline(medline_dir).get_processed_abstracts(pmids)``.

    Args:
        pmids: Passed through unchanged to ``get_processed_abstracts``.
        medline_dir: Passed to the ``medline.medline`` constructor.
        fn: Path of the output file; it is created or truncated.

    NOTE(review): both yielded values are joined with ``"\\t".join`` and
    therefore must already be strings. The column name "stop_stem" suggests
    the abstracts are stop-word filtered and stemmed -- confirm in
    ``medline``.
    """
    print("generating corpus file")
    my_medline = medline.medline(medline_dir)
    # The context manager guarantees the file is closed even when reading
    # or writing fails part-way through the loop.
    with open(fn, "w") as fh:
        fh.write("\t".join(["## pmid", "stop_stem"]) + "\n")
        processed = my_medline.get_processed_abstracts(pmids)
        for i, (pmid, abstract) in enumerate(processed):
            # Progress indicator: report every millionth record (and the first).
            if i % 1000000 == 0:
                print(i)
            fh.write("\t".join([pmid, abstract]) + "\n")
def determine_stats(medline_dir, mesh_fn="../data/medline_mesh.txt",
                    log_fn="../data/medline_stats2.txt"):
    """Dump the MH entries of every record into a tab-separated file.

    The output starts with the header line
    ``## pmid<TAB>major<TAB>key<TAB>minors``. For every ``(pmid, d_info)``
    pair yielded by ``medline.medline(medline_dir)._read_pmids`` and every
    entry of ``d_info["source"]["MH"]`` (treated as empty when the key is
    missing), one line is written holding the pmid, the ``major`` attribute,
    the stringified ``key`` attribute and all items of ``minors`` of the
    ``medline.mesh`` object built from that entry.

    Args:
        medline_dir: Passed to the ``medline.medline`` constructor.
        mesh_fn: Path of the output file; it is created or truncated.
            Defaults to the previously hard-coded location.
        log_fn: Forwarded to ``_read_pmids`` as ``log_fn``. Defaults to the
            previously hard-coded location.

    NOTE(review): "MH" is presumably the MEDLINE MeSH-heading field tag, and
    ``_read_pmids`` presumably writes its own statistics to ``log_fn`` --
    confirm both in ``medline``.
    """
    my_medline = medline.medline(medline_dir)
    # The context manager guarantees the file is closed even when parsing
    # or writing fails part-way through.
    with open(mesh_fn, "w") as fh_m:
        fh_m.write("\t".join(["## pmid", "major", "key", "minors"]) + "\n")
        for pmid, d_info in my_medline._read_pmids(log_fn=log_fn):
            for mesh in d_info["source"].get("MH", []):
                mymesh = medline.mesh(mesh)
                # minors is concatenated to a list, so each minor becomes
                # its own trailing column.
                fields = [pmid, mymesh.major, str(mymesh.key)] + mymesh.minors
                fh_m.write("\t".join(fields) + "\n")