def exportGroupCells(groups, outputdir, dsra2type):
    """Export the cluster membership of every cell.

    One row is produced per cell, in group order: ``[cell, "cluster:<idx>"]``
    where ``<idx>`` is the position of the group in ``groups``. When the cell
    is a key of ``dsra2type`` the mapped value is appended as a third column.
    The rows are handed to ``BioList(...).ex2File`` with the target
    ``<outputdir>/GroupCells.txt``.

    Args:
        groups: sequence of objects exposing a ``cells`` iterable.
        outputdir: directory part of the output path.
        dsra2type: container supporting ``in`` and item lookup by cell.
    """
    rows = []
    for cluster_idx, group in enumerate(groups):
        label = "cluster:%s" % (cluster_idx)
        for cell in group.cells:
            row = [cell, label]
            if cell in dsra2type:
                row.append(dsra2type[cell])
            rows.append(row)
    BioList(rows).ex2File("%s/GroupCells.txt" % (outputdir))
def exportSNP(dfsnp, groups, keptMutations, outputdir):
    """Plot and export the SNP-by-cell presence matrix, grouped by cluster.

    For every mutation in ``keptMutations`` a 0/1 row is built with one entry
    per cell (cells taken group by group, in group order): 1 when the cell is
    ``in dfsnp[mutation]``, else 0. The matrix is drawn as one heat-map panel
    per group (panel width proportional to the group size) and saved to
    ``<outputdir>/SNP_matrix.png``; the same matrix, with a header row and a
    leading SNP column, is passed to ``BioList(...).ex2File`` for
    ``<outputdir>/SNP_matrix.tsv``.

    Args:
        dfsnp: indexable by mutation; each value must support ``cell in value``.
        groups: sequence of objects exposing a sized ``cells`` iterable.
        keptMutations: sequence of mutation identifiers (one matrix row each).
        outputdir: directory part of both output paths.
    """
    # Column order of the matrix: every group's cells, concatenated.
    allcells = [cell for group in groups for cell in group.cells]
    widths = [len(group.cells) for group in groups]

    mutmat = []
    for mutation in keptMutations:
        icells = dfsnp[mutation]
        mutmat.append([1 if cell in icells else 0 for cell in allcells])

    # -------------------------------------------
    # squeeze=False keeps a 2-D axes array even for a single group, so the
    # per-panel loop below works for any number of groups >= 1.
    fig, axs = plt.subplots(1, len(groups), sharey=True, squeeze=False,
                            gridspec_kw={'width_ratios': widths})
    st = 0
    for idx, (ax, width) in enumerate(zip(axs[0], widths)):
        ed = st + width
        ax.imshow([row[st:ed] for row in mutmat], cmap="hot",
                  aspect="auto", interpolation="nearest")
        ax.set_xticklabels([])
        ax.set_xlabel(idx)
        st = ed

    # Invisible full-figure axes that only carries the shared axis labels.
    fig.add_subplot(111, frameon=False)
    # Booleans instead of the legacy 'off' strings, which are non-empty and
    # therefore risk being read as "on".
    plt.tick_params(labelcolor='none', top=False, bottom=False,
                    left=False, right=False)
    plt.grid(False)
    plt.ylabel("SNPs")
    plt.xlabel("Clusters")
    plt.savefig("%s/SNP_matrix.png" % (outputdir), dpi=600)
    # Release the figure so later pyplot calls do not draw on top of it.
    plt.close(fig)

    mutmatout = [['SNP'] + allcells]
    for mutation, row in zip(keptMutations, mutmat):
        mutmatout.append([mutation] + row)
    BioList(mutmatout).ex2File("%s/SNP_matrix.tsv" % (outputdir))
def drawTree(net, outstr, outputdir):
    """Draw ``net`` with a spring layout and export the node coordinates.

    Each node must expose a ``name`` string. The part of the name before the
    first ``|`` is used as the node label; when the name contains a ``|``, the
    part after the last ``|`` is additionally written as small text just above
    the node. The figure is saved to ``<outputdir>/<outstr>.png`` and the list
    of ``[label, (x, y)]`` entries is passed to ``BioList(...).ex2File`` for
    ``<outputdir>/<outstr>.dat``.

    Args:
        net: networkx graph whose nodes have a ``name`` attribute.
        outstr: base name (without extension) of both output files.
        outputdir: directory part of both output paths.
    """
    pos = networkx.spring_layout(net)

    # Split every node name once instead of rebuilding the node list on each
    # loop iteration.
    nodes = list(net.nodes)
    name_parts = {node: node.name.split("|") for node in nodes}
    Xlabels = {node: parts[0] for node, parts in name_parts.items()}

    networkx.draw(net, pos, labels=Xlabels, with_labels=True)

    datgraph = []
    for node in nodes:
        # Look the coordinates up by node rather than by position, so names
        # and coordinates cannot get out of step.
        x, y = pos[node]
        parts = name_parts[node]
        annotation = parts[-1] if len(parts) > 1 else ""
        datgraph.append([parts[0], (x, y)])
        # Shift left proportionally to the text length to roughly centre it.
        plt.text(x - 0.003 * len(annotation), y + 0.05, s=annotation,
                 fontsize=5)

    plt.subplots_adjust(right=0.5)
    plt.savefig("%s/%s.png" % (outputdir, outstr), bbox_inches="tight",
                dpi=600)
    # Close the current figure so a subsequent call starts from a clean canvas
    # instead of overlaying this tree.
    plt.close()
    BioList(datgraph).ex2File("%s/%s.dat" % (outputdir, outstr))
def _best_scored_path(graph, src, dst):
    """Return ``[score, path]`` for the lowest-scoring shortest path, or None.

    Every path yielded by ``nx.all_shortest_paths(graph, src, dst)`` is scored
    with ``getScore``; the first path with the smallest score is returned.
    ``None`` is returned when ``nx.has_path`` reports no path.
    """
    if not nx.has_path(graph, src, dst):
        return None
    scored = [[getScore(graph, path), path]
              for path in nx.all_shortest_paths(graph, src, dst)]
    return min(scored, key=lambda entry: entry[0])


def main():
    """Command-line entry point for the SARS-CoV-2 connectivity analysis.

    Steps, in order:
      1. Parse the command line (-i, -v, -p, -o, -b).
      2. Read the viral-host interaction table, the host PPI table and the
         input gene list (the first row of each table is skipped).
      3. Build an undirected graph over all nodes/edges, with positive edge
         scores converted to ``log2(1 / score)`` weights.
      4. For every (viral source, input gene) pair, keep the best-scoring
         shortest path.
      5. Do the same for a seeded random sample of background nodes.
      6. Compare the two score distributions with a one-sided Mann-Whitney U
         test and print the p-value.
      7. Write paths, the network (.sif), node types and a box plot into the
         output directory.

    Exits with status 1 when ``-b`` is not an integer.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-i",
        "--input_genes",
        required=True,
        help="the input gene list for the SARS-CoV-2 association analysis")
    parser.add_argument(
        "-v",
        "--viral_host_interactions",
        required=True,
        help="The interactions between viral proteins and host proteins")
    parser.add_argument("-p",
                        "--host_protein_interactions",
                        required=True,
                        help="The interaction between host proteins")
    parser.add_argument("-o",
                        "--output",
                        required=True,
                        help="The specified output directory")
    parser.add_argument(
        "-b",
        "--background",
        required=False,
        default=1000,
        help=
        "Integer, the number of background proteins/genes used to calculate the connectivity significance of input genes"
    )
    args = parser.parse_args()

    fnCOVID19 = args.viral_host_interactions
    fnPPI = args.host_protein_interactions
    fnMarker = args.input_genes
    output = args.output
    try:
        bgN = int(args.background)
    except ValueError:
        print("-b must be integer, please check your input!")
        # Non-zero status so calling scripts can detect the failure.
        sys.exit(1)

    #----------------------------
    # read in covid19 interactions
    print("processing input files ...")
    COVInteractions = TabFile(fnCOVID19).read("\t")[1:]
    SProteins = list(set([item[1] for item in COVInteractions]))
    source = list(set([item.split(" ")[0] for item in SProteins]))

    # Map "<short viral name>,<host>" back to the full names from the table.
    dCI = {}
    for row in COVInteractions:
        dCI[row[1].split(" ")[0] + "," + row[3]] = [row[1], row[3]]

    COVInteractions = [[item[1].split(" ")[0], item[3], item[-2]]
                       for item in COVInteractions]
    COVProteins = []
    for row in COVInteractions:
        COVProteins += row[:2]
    COVProteins = list(set(COVProteins))

    # read in PPIs
    PPI = TabFile(fnPPI).read("\t")[1:]
    PPI = [[item[0], item[2], item[3]] for item in PPI]

    # read in markers
    markers = LineFile(fnMarker).read()
    markers = [item for item in markers if item != '']

    print("building protein networks ...")
    AllN = getNodes(COVInteractions) + getNodes(PPI) + markers
    # Order-preserving de-duplication; the order matters because the seeded
    # background sample below is drawn from this list.
    AllNodes = list(dict.fromkeys(AllN))

    AEdges = getEdges(COVInteractions + PPI)
    AllEdges = {}
    for edge_key, edge_score in AEdges.items():
        if edge_score > 0:
            edge_score = math.log(1.0 / edge_score, 2)
        # NOTE(review): non-positive scores are kept untransformed here; the
        # source layout was ambiguous about whether they should be dropped
        # instead -- confirm.
        AllEdges[edge_key] = edge_score

    G = nx.Graph()
    for node in AllNodes:
        G.add_node(node)
    for edge_key, weight in AllEdges.items():
        A, B = edge_key.split(',')
        G.add_edge(A, B, weight=weight)

    print("learning optimal path from SARS-CoV-2 to input genes ...")
    targets = markers
    SPaths = []
    AllProteins = []
    AllUProteins = []
    for i in source:
        for j in targets:
            best = _best_scored_path(G, i, j)
            if best is None:
                continue
            sij, pij = best
            AllProteins += pij
            # Replace the first hop by the full names recorded in dCI.
            uij = dCI[pij[0] + ',' + pij[1]] + pij[2:]
            AllUProteins += uij
            SPaths.append([i, j, sij, uij])
            # NOTE(review): progress print assumed to sit in the found-path
            # branch -- confirm.
            print(j)

    AllProteins = list(set(AllProteins))
    outEdges = extractEdges(AllProteins, AEdges)
    network = []
    for edge_key in outEdges:
        A, B = edge_key.split(",")
        iScore = outEdges[edge_key]
        if edge_key in dCI:
            A, B = dCI[edge_key]
        network.append([A, iScore, B])

    # Sets give constant-time membership tests for the node typing below.
    viral_names = set(SProteins)
    cov_names = set(COVProteins)
    target_names = set(targets)
    NodeInfo = [['Node', 'Type']]
    for protein in AllUProteins:
        if protein in viral_names:
            NodeInfo.append([protein, 'virus'])
        elif protein in cov_names:
            NodeInfo.append([protein, 'source'])
        elif protein in target_names:
            NodeInfo.append([protein, 'target'])
        else:
            NodeInfo.append([protein, 'Intermediate'])

    print(
        "finding the optimal paths from SARS-CoV-2 proteins to all host proteins"
    )
    APaths = []
    ct = 0
    # fix the random initialization
    random.seed(a=10)
    bgPool = AllNodes[1:]
    if bgN > len(bgPool):
        # random.sample raises when asked for more items than available.
        print("only %s background nodes available, using all of them" %
              (len(bgPool)))
        bgN = len(bgPool)
    bgNodes = random.sample(bgPool, bgN)
    for i in source:
        for j in bgNodes:
            best = _best_scored_path(G, i, j)
            if best is None:
                continue
            sij, pij = best
            APaths.append([i, j, sij, pij])
            # NOTE(review): counter assumed to count found paths -- confirm.
            ct += 1
            print(ct)

    SPath = [float(item[2]) for item in SPaths]
    APath = [float(item[2]) for item in APaths]
    pv1 = mannwhitneyu(SPath, APath, alternative='less')
    print("p-value: %s" % (pv1[1]))
    XX = [SPath, APath]

    # Also creates missing parent directories and tolerates an existing one.
    os.makedirs(output, exist_ok=True)
    print("writing results ...")
    print("exporting inferred paths ...")
    BioList(SPaths).ex2File("%s/SPaths.txt" % (output), "\t")
    BioList(APaths).ex2File("%s/APaths.txt" % (output), "\t")
    print("exporting network file (.sif) ...")
    BioList(network).ex2File("%s/network.sif" % (output), '\t')
    print("exporting network node attribute file (.txt)")
    BioList(NodeInfo).ex2File("%s/NodeInfo.txt" % (output), '\t')
    sns.boxplot(data=XX)
    plt.xticks(range(len(XX)), ['Input Genes', 'All Genes'])
    plt.ylabel("Connectivity Score")
    plt.savefig("%s/Connectivity.pdf" % (output))