def myopen(path, *args) :
    """Open *path*, choosing the opener from the file name suffix.

    The path is first passed through ``get_absolute_path``.  When nothing
    exists at the resolved location but the same name with ``.gz`` appended
    does, that ``.gz`` file is used instead.  The opener is then selected:

    * name ends in ``.gz``  -> ``gzip.open``
    * name ends in ``.bgz`` -> ``bgzf.open``
    * anything else         -> builtin ``open``

    Any extra positional arguments are forwarded unchanged to the opener,
    and the opener's return value is returned as-is.
    """
    resolved = get_absolute_path(path)

    # Only probe for the '.gz' sibling when the plain path is missing.
    if not file_exists(resolved) :
        gz_candidate = resolved + '.gz'
        if file_exists(gz_candidate) :
            resolved = gz_candidate

    if resolved[-3:] == '.gz' :
        import gzip
        return gzip.open(resolved, *args)

    if resolved[-4:] == '.bgz' :
        import bgzf # Make sure sys.path has appropriate dir
        return bgzf.open(resolved, *args)

    return open(resolved, *args)
Beispiel #2
0
 def read_header(self):
     """Open ``self.filepath`` with ``bgzf`` and parse the binary header.

     Fields are consumed in this order: a 4-byte magic value, a 4-byte text
     length followed by that many bytes of header text, a 4-byte reference
     count, and then, per reference, a 4-byte name length, the name bytes
     and a 4-byte reference length.  This matches the BAM header layout in
     the SAM/BAM specification, whose integers are little-endian; they are
     therefore decoded with an explicit ``'<i'`` format instead of the
     platform-dependent native ``'i'``.

     Side effects: sets ``self.fh`` (left open) and ``self.header_text``,
     adds one ``AlignmentReference`` per reference to
     ``self.reference_table`` (keyed by name) and appends each name to
     ``self.reference_names``.

     Raises:
         struct.error: if the stream ends before a 4-byte integer can be
             read.
     """
     int32 = struct.Struct('<i')

     def read_int32():
         # One little-endian signed 32-bit integer from the open handle.
         return int32.unpack(self.fh.read(4))[0]

     self.fh = bgzf.open(self.filepath)
     read_int32()  # magic value: consumed to advance the stream, not checked
     l_text = read_int32()
     self.header_text = self.fh.read(l_text)
     n_ref = read_int32()
     for tid in range(n_ref):
         l_name = read_int32()
         # Drop the final byte of the stored name (the NUL terminator in
         # the BAM layout).
         name = self.fh.read(l_name)[:-1]
         l_ref = read_int32()
         self.reference_table[name] = AlignmentReference(tid, name, l_ref)
         self.reference_names.append(name)
Beispiel #3
0
def loadbgzip(filepath, estimated=1362493):
    """Load every JSON record from a length-prefixed BGZF file.

    Each record is stored as an 8-byte little-endian unsigned length
    followed by that many bytes of UTF-8 encoded JSON.

    Args:
        filepath: Path handed to ``bgzf.open`` in ``"rb"`` mode.
        estimated: Expected number of records; used only as the progress
            bar total.

    Returns:
        A list with one decoded object per record, in file order.

    Raises:
        ValueError: if a record's payload is shorter than its length
            prefix announces (truncated file).
    """
    size_header = struct.Struct("<Q")
    entries = []
    # Both the file and the progress bar are context-managed so the bar is
    # closed even when decoding fails part-way through.
    with bgzf.open(filepath, "rb") as fd, tqdm(total=estimated) as pbar:
        while True:
            sizeBuffer = fd.read(size_header.size)
            if len(sizeBuffer) != size_header.size:
                # Fewer than 8 bytes left: treated as end of data.
                break
            (dataSize,) = size_header.unpack(sizeBuffer)
            data = fd.read(dataSize)
            if len(data) != dataSize:
                raise ValueError(
                    "truncated record: expected %d bytes, got %d"
                    % (dataSize, len(data))
                )
            entries.append(ujson.loads(data.decode("utf8")))
            # Advance only after a record was actually loaded, so the bar
            # count equals len(entries).
            pbar.update(1)
    return entries
Beispiel #4
0
def processTitleYearJournal(bgzPapersFilePath,papersDataTypes):
    """Read per-paper columns from a length-prefixed BGZF papers file.

    Every record starts with an 8-byte little-endian signed length.  The
    record body begins with an 8-byte value (the paper index) and is then
    decoded field by field following ``papersDataTypes``.

    Args:
        bgzPapersFilePath: Path handed to ``bgzf.open`` in ``"rb"`` mode.
        papersDataTypes: Iterable of ``(name, type)`` pairs.  A field whose
            type equals ``int`` is an 8-byte little-endian signed integer;
            any other field is an 8-byte length followed by that many
            bytes of UTF-8 text.  The names must include ``PaperTitle``,
            ``Year``, ``PaperId``, ``Doi``, ``OnlineDate`` and
            ``JournalId``.

    Returns:
        A tuple of six parallel lists, in this order: titles, paper ids,
        years, online dates, journal ids, DOIs.
    """
    int64 = struct.Struct("<q")
    estimatedCount = 233745561

    titles = []
    magIDs = []
    dois = []
    years = []
    onlineDates = []
    journalIDs = []

    with bgzf.open(bgzPapersFilePath,"rb") as infd:
        progress = tqdm(total=estimatedCount)
        while True:
            progress.update(1)
            header = infd.read(8)
            if len(header) != 8:
                # Fewer than 8 bytes left: treated as end of data.
                break
            (recordLength,) = int64.unpack(header)
            record = infd.read(recordLength)

            # The leading 8 bytes hold the paper index.  The value itself
            # is not used, but it is still decoded so a record shorter
            # than 8 bytes fails at this point.
            int64.unpack(record[0:8])
            offset = 8

            fields = {}
            for fieldName, fieldType in papersDataTypes:
                # Both field kinds start with one int64: the value itself
                # for int fields, the byte length for text fields.
                (value,) = int64.unpack(record[offset:offset+8])
                offset += 8
                if fieldType==int:
                    fields[fieldName] = value
                    continue
                fields[fieldName] = record[offset:offset+value].decode("utf8")
                offset += value

            titles.append(fields["PaperTitle"])
            years.append(fields["Year"])
            magIDs.append(fields["PaperId"])
            dois.append(fields["Doi"])
            onlineDates.append(fields["OnlineDate"])
            journalIDs.append(fields["JournalId"])

    return titles,magIDs,years,onlineDates,journalIDs,dois
Beispiel #5
0
# setOfJournalsWithPublications = set()
# One accumulator dict per project; all start empty here.
# NOTE(review): the names suggest citation counts keyed by year, but the code
# that fills them is not part of this excerpt -- confirm.
HCACitationsPerYear = {}
HBMapCitationsPerYear = {}
HCAHuBMapCitationsPerYear = {}
HGPCitationsPerYear = {}


# Maps a project label to a pair:
#   (collection of MAG indices for that project, that project's accumulator dict).
# The *ProjectMAGIndices objects are defined elsewhere; they must support
# .union() (presumably sets -- verify). "HuBMAP+HCA" uses the union of the
# HuBMAP and HCA index collections.
projectCitations = {
    "HCA":(HCAProjectMAGIndices,HCACitationsPerYear),
    "HuBMAP":(HBMapProjectMAGIndices,HBMapCitationsPerYear),
    "HuBMAP+HCA":(HBMapProjectMAGIndices.union(HCAProjectMAGIndices),HCAHuBMapCitationsPerYear),
    "HGP":(HGPProjectMAGIndices,HGPCitationsPerYear)
}

# Starts empty; nothing in the lines visible here writes to it.
MAGID2CitationYear = {}
# Stream length-prefixed binary records from the BGZF references file.
with bgzf.open(bgzReferencesFromToPath,"rb") as infd:
    # estimatedCount is defined elsewhere; here it is only the progress-bar total.
    pbar = tqdm(total=estimatedCount)
    count = 0
    while True:
        # Advanced once per loop pass, including the final pass that hits
        # the end of the data.
        pbar.update(1)
        # Each record begins with an 8-byte little-endian signed length.
        lengthData = infd.read(8)
        if(len(lengthData)==8):
            length = struct.unpack("<q",lengthData)[0]
        else:
            # Fewer than 8 bytes left: treated as end of data.
            break
        data = infd.read(length)
        # The record body is decoded as length/8 int64 values: the first is
        # the source index, the remaining ones are counted as edges.
        # NOTE(review): under Python 3 `/` is true division, so edgesCount is
        # a float; the `%d` below still renders it as an integer.
        edgesCount = length/8-1
        fmt = "<%dq" % (edgesCount+1)
        edgeData  = list(struct.unpack(fmt,data))
        fromIndex = edgeData[0]
        # index2magID is defined elsewhere; it is indexed by the source index
        # (presumably yielding the MAG paper id -- verify).
        fromID = index2magID[fromIndex]
Beispiel #6
0
def savebgzip(filepath, entries):
    """Write *entries* to a BGZF file as length-prefixed JSON records.

    Each entry is serialised with ``ujson.dumps``, encoded as UTF-8 and
    written as an 8-byte little-endian unsigned length followed by the
    encoded bytes.  Progress over *entries* is shown with ``tqdm``.

    Args:
        filepath: Path handed to ``bgzf.open`` in ``"wb"`` mode.
        entries: Iterable of objects that ``ujson.dumps`` can serialise.
    """
    size_header = struct.Struct("<Q")
    with bgzf.open(filepath, "wb") as out:
        for record in tqdm(entries):
            payload = ujson.dumps(record).encode("utf8")
            # Length prefix first, then the payload, as two separate writes.
            out.write(size_header.pack(len(payload)))
            out.write(payload)
Beispiel #7
0
# Write the per-journal arrays as text files under dataPath: paper counts
# with integer formatting ("%d"), ISSNs with string formatting ("%s").
# journalPaperCounts, journalISSN, PJ and dataPath are defined elsewhere.
np.savetxt(PJ(dataPath, "MAGJournalPaperCounts.txt.gz"), journalPaperCounts,
           "%d")
np.savetxt(PJ(dataPath, "MAGJournalISSN.txt.gz"), journalISSN, "%s")

# Summary output: totalISSNCount as a percentage of totalBigCount, and the
# length of index2JournalID (all three are defined elsewhere).
print("Total With ISSN: %.0f%%" % (totalISSNCount / totalBigCount * 100))
print("Final journals Count: %d" % len(index2JournalID))

print("\n Reading and Saving Paper 2 Journal Information")
# estimatedCount is only used as the progress-bar total below.
estimatedCount = 242387326
# Used just to display status
# paperID -> (journal, year); filled by the BGZ branch below.
paper2journalYear = {}
paper2journalYearBGZPath = PJ(dataPath, "MAGPaper2JournalYear.bgz")
# Read the BGZ file when it already exists and no full rebuild was requested
# (rebuildAll is defined elsewhere); otherwise fall back to the raw text file.
if (not rebuildAll and os.path.exists(paper2journalYearBGZPath)):
    print("Reading from BGZ file...")
    pbar = tqdm(total=estimatedCount)
    with bgzf.open(paper2journalYearBGZPath, "rb") as fd:
        while True:
            # Advanced once per loop pass, including the final pass that
            # hits the end of the data.
            pbar.update(1)
            # Fixed-size records: three little-endian unsigned 64-bit
            # integers (paperID, journal, year), 24 bytes in total.
            data = fd.read(8 * 3)
            if (len(data) == 8 * 3):
                paperID, journal, year = struct.unpack("<QQQ", data)
                paper2journalYear[paperID] = (journal, year)
            else:
                # Fewer than 24 bytes left: treated as end of data.
                break
else:
    print("Reading from RAW file...")
    # MAGPath is defined elsewhere; Papers.txt is read line by line as text.
    with open(PJ(MAGPath, "Papers.txt"), "rt") as fd:
        pbar = tqdm(total=estimatedCount)
        for line in fd:
            pbar.update(1)
            # Each line is split into tab-separated columns.
            entries = line.split("\t")