Python Transcriptome.update_expression Beispiele

Programmiersprache: Python

Namespace / Paketname: TranscriptomeBasics

Klasse / Typ: Transcriptome

Methode / Funktion: update_expression

Beispiele auf hotexamples.com: 2

Python Transcriptome.update_expression - 2 Beispiele gefunden. Dies sind die am besten bewerteten Python-Beispiele für die Methode TranscriptomeBasics.Transcriptome.update_expression, die aus Open-Source-Projekten extrahiert wurden. Sie können die Beispiele bewerten und so dazu beitragen, ihre Qualität zu verbessern.

Häufig verwendete Methoden

Anzeigen Verbergen

Transcriptome(2)

add_expression(2)

add_genepred_line(2)

set_reference_genome_dictionary(2)

add_expression_no_update(1)

ref_hash(1)

update_expression(1)

Beispiel #1

Datei anzeigen

Datei: simulate_biallelic_transcriptome.py Projekt: songjiajia2018/Manual-for-running-IDP-pipeline

def load_from_inputs(args):
    """Build a random biallelic transcriptome emitter from the input files.

    Steps, in order:
      1. Collect phased SNP genotypes from the VCF (``args.inputs[1]``),
         keyed by chromosome and then position.
      2. Read the reference genome FASTA (``args.inputs[0]``) and run
         ``adjust_reference_genome`` once per allele (0 and 1) on every
         chromosome that has genotypes, giving two per-allele references.
      3. Load every genepred line (``args.inputs[2]``) into one
         ``Transcriptome`` per allele, then release the reference sequences.
      4. Group transcripts into loci, either via ``get_loci`` or by gene name.
      5. Optionally load isoform expression (plain TSV or cufflinks table).
      6. Choose rho (the ASE of allele 1) for every transcript.

    Args:
        args: parsed options.  Attributes read here: ``inputs``, ``threads``,
            ``locus_by_gene_name``, ``isoform_expression``,
            ``cufflinks_isoform_expression``, ``uniform_expression``,
            ``seed``, ``ASE_identical`` and ``ASE_isoform_random``.

    Returns:
        A ``SimulationBasics.RandomBiallelicTranscriptomeEmitter`` built from
        the two transcriptomes, with ``gene_names``, ``name2locus`` and the
        per-transcript rho values set.
    """
    # Read in the VCF file (args.inputs[1] is the phased VCF)
    sys.stderr.write("Reading in the VCF file\n")
    alleles = {}
    with open(args.inputs[1]) as inf:
        for line in inf:
            vcf = VCF(line)
            if not vcf.is_snp():
                continue
            g = vcf.get_phased_genotype()
            if not g:
                continue
            chrom = vcf.value('chrom')
            pos = vcf.value('pos')
            chrom_alleles = alleles.setdefault(chrom, {})
            if pos in chrom_alleles:
                sys.stderr.write("WARNING: seeing the same position twice.\n" +
                                 line.rstrip() + "\n")
            # The last genotype seen for a position wins
            chrom_alleles[pos] = g  # set our left and right

    # args.inputs[0] is the reference genome FASTA
    sys.stderr.write("Reading in the reference genome\n")
    ref = read_fasta_into_hash(args.inputs[0])
    res1 = []
    res2 = []
    p = None
    sys.stderr.write("Introducing VCF changes to reference sequences\n")
    # Pretty memory intensive, so don't go with all possible threads
    if args.threads > 1:
        p = Pool(processes=max(1, int(args.threads / 4)))
    # Every entry appended to res1/res2 exposes .get() returning
    # [number_of_changes, chromosome_name, sequence]; already-computed
    # values are wrapped in a Queue so they share that interface with the
    # async results from the pool.
    for chrom in ref:
        # handle the case where there is no allele information
        if chrom not in alleles:
            r1q = Queue()
            r1q.put([0, chrom, ref[chrom]])
            res1.append(r1q)
            r2q = Queue()
            r2q.put([0, chrom, ref[chrom]])
            res2.append(r2q)
        elif args.threads > 1:
            res1.append(
                p.apply_async(adjust_reference_genome,
                              args=(alleles[chrom], ref[chrom], 0, chrom)))
            res2.append(
                p.apply_async(adjust_reference_genome,
                              args=(alleles[chrom], ref[chrom], 1, chrom)))
        else:
            r1q = Queue()
            r1q.put(
                adjust_reference_genome(alleles[chrom], ref[chrom], 0, chrom))
            res1.append(r1q)
            r2q = Queue()
            r2q.put(
                adjust_reference_genome(alleles[chrom], ref[chrom], 1, chrom))
            res2.append(r2q)
    if args.threads > 1:
        p.close()
        p.join()

    # now we can fill reference 1 with all our new sequences
    ref1 = {}
    c1 = 0
    for holder in res1:
        res = holder.get()
        c1 += res[0]
        ref1[res[1]] = res[2]

    # now we can fill reference 2 with all our new sequences
    ref2 = {}
    c2 = 0
    for holder in res2:
        res = holder.get()
        c2 += res[0]
        ref2[res[1]] = res[2]
    sys.stderr.write("Made " + str(c1) + "|" + str(c2) +
                     " changes to the reference\n")

    # Now ref1 and ref2 are the diploid sources of the transcriptome
    gpdnames = {}
    txn1 = Transcriptome()
    txn2 = Transcriptome()
    txn1.set_reference_genome_dictionary(ref1)
    txn2.set_reference_genome_dictionary(ref2)
    # args.inputs[2] is the transcripts genepred file
    with open(args.inputs[2]) as inf:
        for line in inf:
            if line[0] == '#':
                continue
            entry = line.rstrip()
            txn1.add_genepred_line(entry)
            txn2.add_genepred_line(entry)
            gpd = GenePredEntry(entry)
            gpdnames[gpd.value('name')] = gpd.value('gene_name')
    # The transcriptomes are set but we don't really need the references
    # anymore.  Empty our big memory things.
    txn1.ref_hash = None
    txn2.ref_hash = None
    # clear() empties each dict in place.  Deleting keys while looping over
    # .keys() raises RuntimeError on Python 3, where keys() is a live view.
    ref1.clear()
    ref2.clear()
    ref.clear()

    if not args.locus_by_gene_name:
        [locus2name, name2locus] = get_loci(args.inputs[2])
    else:  # set locus by gene name
        sys.stderr.write("Organizing loci by gene name\n")
        locus2name = {}
        name2locus = {}
        numname = {}
        m = 0
        # Locus numbers are handed out in sorted transcript-name order, the
        # first time each gene name is encountered.
        for name in sorted(gpdnames):
            gene = gpdnames[name]
            if gene not in numname:
                m += 1
                numname[gene] = m
            num = numname[gene]
            if num not in locus2name:
                locus2name[num] = set()
            locus2name[num].add(name)
            name2locus[name] = num
        sys.stderr.write("Ended with " + str(len(locus2name)) +
                         " loci\n")

    if args.isoform_expression:
        sys.stderr.write("Reading expression from a TSV\n")
        with open(args.isoform_expression) as inf:
            inf.readline()  # skip the first line of the file
            for line in inf:
                f = line.rstrip().split("\t")
                txn1.add_expression(f[0], float(f[1]))
                txn2.add_expression(f[0], float(f[1]))
    elif args.cufflinks_isoform_expression:
        sys.stderr.write("Using cufflinks expression\n")
        cuffz = 0
        with open(args.cufflinks_isoform_expression) as inf:
            inf.readline()  # skip the first line of the file
            for line in inf:
                cuffz += 1
                sys.stderr.write(str(cuffz) + " cufflinks entries processed\r")
                f = line.rstrip().split("\t")
                # Name is column 0 and the expression value is column 9
                txn1.add_expression_no_update(f[0], float(f[9]))
                txn2.add_expression_no_update(f[0], float(f[9]))
        # Entries were added without updating, so update once at the end
        txn1.update_expression()
        txn2.update_expression()
        sys.stderr.write("\n")
    elif args.uniform_expression:
        sys.stderr.write("Using uniform expression model\n")
    else:
        sys.stderr.write(
            "Warning isoform expression not sepcified, using uniform expression model.\n"
        )
    # Now we have the transcriptomes set
    rhos = {}  # The ASE of allele 1 (the left side)
    randos = {}
    # NOTE: a falsy seed (including 0) is not applied
    if args.seed:
        random.seed(args.seed)
    # One random value per locus, used by the locus-random mode below
    for z in locus2name:
        randos[z] = random.random()
    sys.stderr.write("Setting rho for each transcript\n")
    # Let's set rho for ASE for each transcript
    for tname in sorted(txn1.transcripts):
        # The explicit "== 0" keeps a requested ASE of exactly 0 from being
        # mistaken for "option not given".
        if args.ASE_identical or args.ASE_identical == 0:
            rhos[tname] = float(args.ASE_identical)
        elif args.ASE_isoform_random:
            rhos[tname] = random.random()
        else:  # we must be on locus random
            rhos[tname] = randos[name2locus[tname]]
    # Now our dataset is set up
    rbe = SimulationBasics.RandomBiallelicTranscriptomeEmitter(txn1, txn2)
    rbe.gene_names = gpdnames
    rbe.name2locus = name2locus
    rbe.set_transcriptome1_rho(rhos)
    return rbe

Beispiel #2

Datei anzeigen

Datei: simulate_biallelic_transcriptome.py Projekt: jason-weirather/Au-public

def load_from_inputs(args):
  """Build a random biallelic transcriptome emitter from the input files.

  Steps, in order:
    1. Collect phased SNP genotypes from the VCF (``args.inputs[1]``),
       keyed by chromosome and then position.
    2. Read the reference genome FASTA (``args.inputs[0]``) and run
       ``adjust_reference_genome`` once per allele (0 and 1) on every
       chromosome that has genotypes, giving two per-allele references.
    3. Load every genepred line (``args.inputs[2]``) into one
       ``Transcriptome`` per allele, then release the reference sequences.
    4. Group transcripts into loci, either via ``get_loci`` or by gene name.
    5. Optionally load isoform expression (plain TSV or cufflinks table).
    6. Choose rho (the ASE of allele 1) for every transcript.

  Args:
    args: parsed options.  Attributes read here: ``inputs``, ``threads``,
      ``locus_by_gene_name``, ``isoform_expression``,
      ``cufflinks_isoform_expression``, ``uniform_expression``,
      ``seed``, ``ASE_identical`` and ``ASE_isoform_random``.

  Returns:
    A ``SimulationBasics.RandomBiallelicTranscriptomeEmitter`` built from
    the two transcriptomes, with ``gene_names``, ``name2locus`` and the
    per-transcript rho values set.
  """
  # Read in the VCF file (args.inputs[1] is the phased VCF)
  sys.stderr.write("Reading in the VCF file\n")
  alleles = {}
  with open(args.inputs[1]) as inf:
    for line in inf:
      vcf = VCF(line)
      if not vcf.is_snp(): continue
      g = vcf.get_phased_genotype()
      if not g: continue
      chrom = vcf.value('chrom')
      pos = vcf.value('pos')
      chrom_alleles = alleles.setdefault(chrom, {})
      if pos in chrom_alleles:
        sys.stderr.write("WARNING: seeing the same position twice.\n"+line.rstrip()+"\n")
      # The last genotype seen for a position wins
      chrom_alleles[pos] = g # set our left and right

  # args.inputs[0] is the reference genome FASTA
  sys.stderr.write("Reading in the reference genome\n")
  ref = read_fasta_into_hash(args.inputs[0])
  res1 = []
  res2 = []
  p = None
  sys.stderr.write("Introducing VCF changes to reference sequences\n")
  # Pretty memory intensive, so don't go with all possible threads
  if args.threads > 1: p = Pool(processes=max(1,int(args.threads/4)))
  # Every entry appended to res1/res2 exposes .get() returning
  # [number_of_changes, chromosome_name, sequence]; already-computed values
  # are wrapped in a Queue so they share that interface with the async
  # results from the pool.
  for chrom in ref:
    # handle the case where there is no allele information
    if chrom not in alleles:
      r1q = Queue()
      r1q.put([0,chrom,ref[chrom]])
      res1.append(r1q)
      r2q = Queue()
      r2q.put([0,chrom,ref[chrom]])
      res2.append(r2q)
    elif args.threads > 1:
      res1.append(p.apply_async(adjust_reference_genome,args=(alleles[chrom],ref[chrom],0,chrom)))
      res2.append(p.apply_async(adjust_reference_genome,args=(alleles[chrom],ref[chrom],1,chrom)))
    else:
      r1q = Queue()
      r1q.put(adjust_reference_genome(alleles[chrom],ref[chrom],0,chrom))
      res1.append(r1q)
      r2q = Queue()
      r2q.put(adjust_reference_genome(alleles[chrom],ref[chrom],1,chrom))
      res2.append(r2q)
  if args.threads > 1:
    p.close()
    p.join()

  # now we can fill reference 1 with all our new sequences
  ref1 = {}
  c1 = 0
  for holder in res1:
    res = holder.get()
    c1 += res[0]
    ref1[res[1]]=res[2]

  # now we can fill reference 2 with all our new sequences
  ref2 = {}
  c2 = 0
  for holder in res2:
    res = holder.get()
    c2 += res[0]
    ref2[res[1]]=res[2]
  sys.stderr.write("Made "+str(c1)+"|"+str(c2)+" changes to the reference\n")

  # Now ref1 and ref2 are the diploid sources of the transcriptome
  gpdnames = {}
  txn1 = Transcriptome()
  txn2 = Transcriptome()
  txn1.set_reference_genome_dictionary(ref1)
  txn2.set_reference_genome_dictionary(ref2)
  # args.inputs[2] is the transcripts genepred file
  with open(args.inputs[2]) as inf:
    for line in inf:
      if line[0]=='#': continue
      entry = line.rstrip()
      txn1.add_genepred_line(entry)
      txn2.add_genepred_line(entry)
      gpd = GenePredEntry(entry)
      gpdnames[gpd.value('name')] = gpd.value('gene_name')
  # The transcriptomes are set but we don't really need the references anymore
  # Empty our big memory things
  txn1.ref_hash = None
  txn2.ref_hash = None
  # clear() empties each dict in place.  Deleting keys while looping over
  # .keys() raises RuntimeError on Python 3, where keys() is a live view.
  ref1.clear()
  ref2.clear()
  ref.clear()

  if not args.locus_by_gene_name:
    [locus2name,name2locus] = get_loci(args.inputs[2])
  else: # set locus by gene name
    sys.stderr.write("Organizing loci by gene name\n")
    locus2name = {}
    name2locus = {}
    numname = {}
    m = 0
    # Locus numbers are handed out in sorted transcript-name order, the
    # first time each gene name is encountered.
    for name in sorted(gpdnames):
      gene = gpdnames[name]
      if gene not in numname:
        m+=1
        numname[gene] = m
      num = numname[gene]
      if num not in locus2name:
        locus2name[num] = set()
      locus2name[num].add(name)
      name2locus[name] = num
    sys.stderr.write("Ended with "+str(len(locus2name))+" loci\n")

  if args.isoform_expression:
    sys.stderr.write("Reading expression from a TSV\n")
    with open(args.isoform_expression) as inf:
      inf.readline() # skip the first line of the file
      for line in inf:
        f = line.rstrip().split("\t")
        txn1.add_expression(f[0],float(f[1]))
        txn2.add_expression(f[0],float(f[1]))
  elif args.cufflinks_isoform_expression:
    sys.stderr.write("Using cufflinks expression\n")
    cuffz = 0
    with open(args.cufflinks_isoform_expression) as inf:
      inf.readline() # skip the first line of the file
      for line in inf:
        cuffz +=1
        sys.stderr.write(str(cuffz)+" cufflinks entries processed\r")
        f = line.rstrip().split("\t")
        # Name is column 0 and the expression value is column 9
        txn1.add_expression_no_update(f[0],float(f[9]))
        txn2.add_expression_no_update(f[0],float(f[9]))
    # Entries were added without updating, so update once at the end
    txn1.update_expression()
    txn2.update_expression()
    sys.stderr.write("\n")
  elif args.uniform_expression:
    sys.stderr.write("Using uniform expression model\n")
  else:
    sys.stderr.write("Warning isoform expression not sepcified, using uniform expression model.\n")
  # Now we have the transcriptomes set
  rhos = {} # The ASE of allele 1 (the left side)
  randos = {}
  # NOTE: a falsy seed (including 0) is not applied
  if args.seed:
    random.seed(args.seed)
  # One random value per locus, used by the locus-random mode below
  for z in locus2name: randos[z] = random.random()
  sys.stderr.write("Setting rho for each transcript\n")
  # Let's set rho for ASE for each transcript
  for tname in sorted(txn1.transcripts):
    # The explicit "== 0" keeps a requested ASE of exactly 0 from being
    # mistaken for "option not given".
    if args.ASE_identical or args.ASE_identical == 0:
      rhos[tname] = float(args.ASE_identical)
    elif args.ASE_isoform_random:
      rhos[tname] = random.random()
    else: # we must be on locus random
      rhos[tname] = randos[name2locus[tname]]
  # Now our dataset is set up
  rbe = SimulationBasics.RandomBiallelicTranscriptomeEmitter(txn1,txn2)
  rbe.gene_names = gpdnames
  rbe.name2locus = name2locus
  rbe.set_transcriptome1_rho(rhos)
  return rbe