def myopen(path, *args):
    """Open ``path`` with the opener that matches its extension.

    The path is first passed through ``get_absolute_path``.  When the
    resulting file does not exist but a sibling with ``.gz`` appended does,
    that sibling is opened instead.

    Args:
        path: Location of the file to open.
        *args: Extra positional arguments forwarded unchanged to whichever
            ``open`` function is selected.

    Returns:
        The result of ``gzip.open`` for paths ending in ``.gz``, of
        ``bgzf.open`` for paths ending in ``.bgz``, and of the builtin
        ``open`` for everything else.
    """
    path = get_absolute_path(path)

    # Fall back to the gzip-compressed sibling when only that one exists.
    if not file_exists(path):
        gz_candidate = path + '.gz'
        if file_exists(gz_candidate):
            path = gz_candidate

    if path[-3:] == '.gz':
        import gzip
        return gzip.open(path, *args)

    if path[-4:] == '.bgz':
        import bgzf  # Make sure sys.path has appropriate dir
        return bgzf.open(path, *args)

    return open(path, *args)
def read_header(self):
    """Open ``self.filepath`` with ``bgzf`` and parse the binary header.

    Fields are read in this order: a 4-byte magic value, an int32 text
    length, that many bytes of header text, an int32 reference count, and
    then for every reference an int32 name length, the name bytes and an
    int32 reference length.

    Side effects:
        * ``self.fh`` is set to the opened file and is left open.
        * ``self.header_text`` receives the raw header text.
        * ``self.reference_table[name]`` gets an ``AlignmentReference`` for
          every reference, and ``name`` is appended to
          ``self.reference_names`` (both containers must already exist).

    Raises:
        struct.error: If the file ends before a complete integer was read.
    """
    # NOTE(review): this layout matches the BAM header from the SAM/BAM
    # specification, which stores every integer little-endian. The format is
    # therefore spelled '<i' instead of the native-order 'i'.
    int32 = struct.Struct('<i')

    self.fh = bgzf.open(self.filepath)

    def read_int32():
        # Read one little-endian 32-bit signed integer from the file.
        return int32.unpack(self.fh.read(int32.size))[0]

    read_int32()  # magic value: consumed but not inspected
    l_text = read_int32()
    self.header_text = self.fh.read(l_text)

    n_ref = read_int32()
    for tid in range(n_ref):
        l_name = read_int32()
        # The stored name carries one trailing byte that is dropped here
        # (presumably the NUL terminator - verify against the format spec).
        name = self.fh.read(l_name)[:-1]
        l_ref = read_int32()
        self.reference_table[name] = AlignmentReference(tid, name, l_ref)
        self.reference_names.append(name)
def loadbgzip(filepath, estimated=1362493):
    """Load every length-prefixed JSON record stored in a BGZF file.

    Each record consists of an 8-byte little-endian unsigned length followed
    by that many bytes of UTF-8 encoded JSON.  Reading stops at the first
    point where a full 8-byte length can no longer be read.

    Args:
        filepath: Path of the BGZF file to read.
        estimated: Expected number of records; only used as the total of
            the progress bar.

    Returns:
        list: The decoded entries, in file order.
    """
    entries = []
    length_prefix = struct.Struct("<Q")

    with bgzf.open(filepath, "rb") as fd:
        # The context manager closes the bar even when decoding fails.
        with tqdm(total=estimated) as pbar:
            while True:
                size_buffer = fd.read(length_prefix.size)
                if len(size_buffer) != length_prefix.size:
                    break
                (data_size,) = length_prefix.unpack(size_buffer)
                data = fd.read(data_size)
                entries.append(ujson.loads(data.decode("utf8")))
                # Advance only after a record was actually decoded, so the
                # end-of-file probe is not counted as an entry.
                pbar.update(1)

    return entries
def processTitleYearJournal(bgzPapersFilePath, papersDataTypes):
    """Read the papers BGZF file and collect selected fields per paper.

    Every record is an 8-byte little-endian signed length followed by the
    record payload.  The payload starts with 8 bytes that are skipped here
    (the original code unpacked them as a paper index without using it),
    followed by one value per entry of ``papersDataTypes``:

    * for entries whose type is ``int``: an 8-byte little-endian signed
      integer;
    * for all other entries: an 8-byte little-endian signed byte count
      followed by that many bytes of UTF-8 text.

    Args:
        bgzPapersFilePath: Path of the BGZF file to read.
        papersDataTypes: Iterable of ``(name, type)`` pairs describing the
            fields of a record in storage order.  It must contain the names
            ``"PaperTitle"``, ``"Year"``, ``"PaperId"``, ``"Doi"``,
            ``"OnlineDate"`` and ``"JournalId"``.

    Returns:
        tuple: Six parallel lists, one element per record, in the order
        ``(titles, PaperId values, years, online dates, JournalId values,
        DOIs)``.

    Raises:
        KeyError: If one of the required field names is missing.
        struct.error: If a record is shorter than its field list requires.
    """
    estimatedCount = 233745561  # only used as the progress-bar total

    index2magID = []
    index2Title = []
    index2DOI = []
    index2Year = []
    index2OnlineDate = []
    index2JournalID = []

    int64 = struct.Struct("<q")

    with bgzf.open(bgzPapersFilePath, "rb") as infd:
        # The context manager closes the bar even when parsing fails.
        with tqdm(total=estimatedCount) as pbar:
            while True:
                lengthData = infd.read(int64.size)
                if len(lengthData) != int64.size:
                    break
                (length,) = int64.unpack(lengthData)
                data = infd.read(length)

                # Skip the leading 8-byte paper index; it is not needed.
                currentPointer = int64.size
                dataDict = {}
                for typeName, typeType in papersDataTypes:
                    # Both encodings start with one int64: the value itself
                    # for int fields, the byte count for text fields.
                    (entryValue,) = int64.unpack_from(data, currentPointer)
                    currentPointer += int64.size
                    if typeType == int:
                        dataDict[typeName] = entryValue
                    else:
                        entryEnd = currentPointer + entryValue
                        entryBytes = data[currentPointer:entryEnd]
                        dataDict[typeName] = entryBytes.decode("utf8")
                        currentPointer += entryValue

                index2Title.append(dataDict["PaperTitle"])
                index2Year.append(dataDict["Year"])
                index2magID.append(dataDict["PaperId"])
                index2DOI.append(dataDict["Doi"])
                index2OnlineDate.append(dataDict["OnlineDate"])
                index2JournalID.append(dataDict["JournalId"])

                # Advance only after a full record was processed, so the
                # end-of-file probe is not counted as a paper.
                pbar.update(1)

    return (
        index2Title,
        index2magID,
        index2Year,
        index2OnlineDate,
        index2JournalID,
        index2DOI,
    )
# setOfJournalsWithPublications = set()

# One citations-per-year accumulator per project.
HCACitationsPerYear = {}
HBMapCitationsPerYear = {}
HCAHuBMapCitationsPerYear = {}
HGPCitationsPerYear = {}

# Project label -> (set of project paper indices, accumulator dict).
projectCitations = {
    "HCA": (HCAProjectMAGIndices, HCACitationsPerYear),
    "HuBMAP": (HBMapProjectMAGIndices, HBMapCitationsPerYear),
    "HuBMAP+HCA": (
        HBMapProjectMAGIndices.union(HCAProjectMAGIndices),
        HCAHuBMapCitationsPerYear,
    ),
    "HGP": (HGPProjectMAGIndices, HGPCitationsPerYear),
}

MAGID2CitationYear = {}

# Each record is an 8-byte little-endian signed length followed by that many
# bytes holding consecutive little-endian int64 values; the first value is
# the index of the citing paper.
with bgzf.open(bgzReferencesFromToPath, "rb") as infd:
    pbar = tqdm(total=estimatedCount)
    count = 0
    while True:
        pbar.update(1)
        lengthData = infd.read(8)
        if len(lengthData) != 8:
            break
        length = struct.unpack("<q", lengthData)[0]
        data = infd.read(length)
        # Floor division keeps the edge count an integer on Python 3, where
        # "/" would turn it into a float.
        edgesCount = length // 8 - 1
        fmt = "<%dq" % (edgesCount + 1)
        edgeData = list(struct.unpack(fmt, data))
        fromIndex = edgeData[0]
        fromID = index2magID[fromIndex]
def savebgzip(filepath, entries):
    """Write ``entries`` to a BGZF file as length-prefixed JSON records.

    For every entry, in iteration order, the function writes an 8-byte
    little-endian unsigned length followed by the UTF-8 encoded JSON text
    of the entry.  Progress is displayed with ``tqdm``.

    Args:
        filepath: Path of the BGZF file to create.
        entries: Iterable of objects that ``ujson.dumps`` can serialise.
    """
    length_prefix = struct.Struct("<Q")

    with bgzf.open(filepath, "wb") as out:
        for record in tqdm(entries):
            payload = ujson.dumps(record).encode("utf8")
            # Length first, then the payload it describes.
            out.write(length_prefix.pack(len(payload)))
            out.write(payload)
# Persist the per-journal tables as gzip-compressed text files.
np.savetxt(PJ(dataPath, "MAGJournalPaperCounts.txt.gz"), journalPaperCounts, "%d")
np.savetxt(PJ(dataPath, "MAGJournalISSN.txt.gz"), journalISSN, "%s")

print("Total With ISSN: %.0f%%" % (totalISSNCount / totalBigCount * 100))
print("Final journals Count: %d" % len(index2JournalID))

print("\n Reading and Saving Paper 2 Journal Information")
estimatedCount = 242387326  # Used just to display status
paper2journalYear = {}
paper2journalYearBGZPath = PJ(dataPath, "MAGPaper2JournalYear.bgz")

if (not rebuildAll and os.path.exists(paper2journalYearBGZPath)):
    # Cached path: the BGZF file stores fixed-size records made of three
    # little-endian unsigned 64-bit integers (paper ID, journal, year).
    print("Reading from BGZ file...")
    pbar = tqdm(total=estimatedCount)
    with bgzf.open(paper2journalYearBGZPath, "rb") as fd:
        while True:
            pbar.update(1)
            data = fd.read(8 * 3)
            # Anything shorter than a full record marks the end of the data.
            if len(data) != 8 * 3:
                break
            paperID, journal, year = struct.unpack("<QQQ", data)
            paper2journalYear[paperID] = (journal, year)
else:
    # Rebuild path: parse the tab-separated raw dump line by line.
    print("Reading from RAW file...")
    with open(PJ(MAGPath, "Papers.txt"), "rt") as fd:
        pbar = tqdm(total=estimatedCount)
        for line in fd:
            pbar.update(1)
            entries = line.split("\t")