Esempio n. 1
0
def main():
    """Generate the fingerprint file for a SMILES file unless it already exists.

    Command line: ``smiles1 outputprefix``.

    Every line of ``smiles1`` is echoed to stdout and must carry at least two
    whitespace-separated fields (the second field is the molecule name).
    Fingerprints are produced by ``tccalc.get_fp`` into ``<outputprefix>.fp``.
    If that file is already present nothing is recomputed and the process
    exits.

    Raises:
        IndexError: a line of the SMILES file has fewer than two fields.
        SystemExit: the fingerprint file already exists.
    """
    if len(sys.argv) != 3:  # wrong number of arguments
        print("ERORR")
        print("syntax: python generate_chemaxon_fingerprints.py smiles1 outputprefix")
        return

    smilesfile1 = sys.argv[1]
    outfileF = sys.argv[2] + '.fp'

    # Handed to tccalc.get_fp so that concurrent runs do not write over the
    # same file.
    pid = str(os.getpid())

    # Echo the input and check that every line has a name column.  The
    # context manager closes the file even when a malformed line aborts
    # the run.
    with open(smilesfile1, 'r') as smiles:
        for lineno, line in enumerate(smiles, 1):
            print(line)
            if len(line.split()) < 2:
                raise IndexError(
                    "%s: line %d has no name column" % (smilesfile1, lineno))

    if os.path.isfile(outfileF):
        print(outfileF + " exists.")
        # sys.exit() rather than the site-provided exit(), which is meant
        # for the interactive interpreter and may be absent.
        sys.exit()

    tccalc.get_fp(smilesfile1, outfileF, pid)
def _print_matrix_usage():
    """Print the command-line help for the similarity-matrix script."""
    print("ERORR")
    print("syntexs: python tanimoto_cal_axon.py -one smiles1 outputprefix")
    print("         this produces a squere symestric matrix of set1 with itself. ")
    print("syntexs: python tanimoto_cal_axon.py -two smiles1 smiles2 outputprefix")
    print("         this produces a rectangular non-symestric matrix of set1 to set2")


def _write_score_matrix(path, row_fps, col_fps, score):
    """Write ``score(row, col)`` for every pair to *path*.

    One text line per entry of *row_fps*; the values of a line are formatted
    with ``%f`` and separated by commas.
    """
    with open(path, 'w') as out:
        for row_fp in row_fps:
            out.write(','.join('%f' % score(row_fp, col_fp)
                               for col_fp in col_fps))
            out.write('\n')


def main():
    """Write Tanimoto and Tversky similarity matrices for fingerprint sets.

    Command line:
        ``-one smiles1 outputprefix``          set1 against itself
        ``-two smiles1 smiles2 outputprefix``  set1 (rows) against set2 (columns)

    Fingerprints come from ``tccalc.get_fp`` and are stored in
    ``<prefix>.1.fp`` (and ``<prefix>.2.fp`` for ``-two``).  The matrices go
    to ``<prefix>.tanimoto.matrix`` and ``<prefix>.tversky.0.2.0.2.matrix``.

    Raises:
        SystemExit: the first argument is neither ``-one`` nor ``-two``.
    """
    if len(sys.argv) not in (4, 5):  # wrong number of arguments
        _print_matrix_usage()
        return

    # Handed to tccalc.get_fp so that concurrent runs do not write over the
    # same file.
    pid = str(os.getpid())
    print(pid)

    oneortwo = sys.argv[1]
    if oneortwo == "-one":
        expected_argc = 4
    elif oneortwo == "-two":
        expected_argc = 5
    else:
        print("the frist parameter must be -one or -two.")
        sys.exit()

    # Each mode has its own arity: "-two" with a missing argument would
    # otherwise fail on sys.argv[4], and "-one" with an extra one would
    # silently take the second SMILES file as the output prefix.
    if len(sys.argv) != expected_argc:
        _print_matrix_usage()
        return

    smilesfile1 = sys.argv[2]
    outfileprefix = sys.argv[-1]

    fpvec1 = tccalc.get_fp(smilesfile1, outfileprefix + '.1.fp', pid)
    if oneortwo == "-one":
        fpvec2 = fpvec1
    else:
        smilesfile2 = sys.argv[3]
        fpvec2 = tccalc.get_fp(smilesfile2, outfileprefix + '.2.fp', pid)

    _write_score_matrix(outfileprefix + '.tanimoto.matrix',
                        fpvec1, fpvec2, tccalc.tanimoto)

    alpha = 0.2
    beta = 0.2
    tversky_path = (outfileprefix + '.tversky.' + str(alpha) + '.' +
                    str(beta) + '.matrix')
    _write_score_matrix(
        tversky_path, fpvec1, fpvec2,
        lambda fp1, fp2: tccalc.tversky_index(fp1, fp2, alpha, beta))
def _read_column(path, index):
    """Return whitespace-separated field *index* of every line in *path*.

    Raises IndexError if a line has too few fields.
    """
    with open(path, 'r') as handle:
        return [line.split()[index] for line in handle]


def main():
    """Best-first cluster the molecules of a SMILES file by Tanimoto similarity.

    Command line: ``smiles1 outputprefix threshold``.

    Molecule names are the second field of each SMILES line.  Fingerprints
    are read from ``<outputprefix>.fp`` (third field of each line) when that
    file exists, otherwise computed with ``tccalc.get_fp``.

    Molecules are visited in file order.  The first one that has no cluster
    yet starts a new cluster, and every other unassigned molecule whose
    Tanimoto coefficient to it exceeds ``threshold`` joins that cluster.

    The result is written to ``<outputprefix>.clusters`` as
    ``name,cluster,tc`` lines, where ``tc`` is the coefficient to the
    cluster's first molecule (1.0 for that molecule itself).

    Raises:
        ValueError: ``threshold`` is not a number.
        IndexError: an input line has too few fields, or there are fewer
            names than fingerprints.
    """
    if len(sys.argv) != 4:  # wrong number of arguments
        print("ERORR")
        print("syntexs: python best_frist_tc_clusters.py smiles1 outputprefix threshold")
        return

    smilesfile1 = sys.argv[1]
    outfileprefix = sys.argv[2]
    tc_thres = float(sys.argv[3])

    outfileF = outfileprefix + '.fp'
    outfileC = outfileprefix + '.clusters'

    # Handed to tccalc.get_fp so that concurrent runs do not write over the
    # same file.
    pid = str(os.getpid())

    names = _read_column(smilesfile1, 1)

    # Reuse the fingerprint file when it already exists, otherwise
    # calculate the fingerprints.
    if os.path.isfile(outfileF):
        print(outfileF + "exists.")
        fpvec1 = _read_column(outfileF, 2)
    else:
        fpvec1 = tccalc.get_fp(smilesfile1, outfileF, pid)

    # Cluster 0 means "not assigned yet"; a cluster seed keeps tc 1.0.
    clusters = [0] * len(fpvec1)
    tcs = [1.0] * len(fpvec1)

    cluster_num = 1
    for i, fp1 in enumerate(fpvec1):
        print(i)
        if clusters[i] != 0:
            continue  # already a member of an earlier cluster
        clusters[i] = cluster_num  # this molecule seeds the next cluster
        for j, fp2 in enumerate(fpvec1):
            # Skips earlier members and, because the seed was assigned just
            # above, the seed itself.
            if clusters[j] != 0:
                continue
            TC = tccalc.tanimoto(fp1, fp2)
            if TC > tc_thres:
                print("%d %d %d" % (cluster_num, i, j))
                clusters[j] = cluster_num
                tcs[j] = TC
        cluster_num += 1

    with open(outfileC, 'w') as out:
        for i, cluster in enumerate(clusters):
            out.write('%s,%d,%f\n' % (names[i], cluster, tcs[i]))
def _read_column(path, index):
    """Return whitespace-separated field *index* of every line in *path*.

    Raises IndexError if a line has too few fields.
    """
    with open(path, 'r') as handle:
        return [line.split()[index] for line in handle]


def main():
    """Best-first cluster a SMILES file, writing one file per cluster.

    Command line: ``smiles1 outputprefix threshold``.

    Molecule names are the second field of each SMILES line.  Fingerprints
    are read from ``<outputprefix>.fp`` (third field of each line) when that
    file exists, otherwise computed with ``tccalc.get_fp``.

    Repeatedly, the first molecule still in the list becomes the seed of the
    next cluster.  The seed and every remaining molecule whose Tanimoto
    coefficient to the seed exceeds ``threshold`` are written as ``name,tc``
    lines to ``<outputprefix>.cluster.<n>``; all others are kept for the
    next round.

    Raises:
        ValueError: ``threshold`` is not a number.
        IndexError: an input line has too few fields, or there are fewer
            names than fingerprints.
        KeyError: a name has no fingerprint.
    """
    if len(sys.argv) != 4:  # wrong number of arguments
        print("ERORR")
        print("syntexs: python best_frist_tc_clusters.py smiles1 outputprefix threshold")
        return

    smilesfile1 = sys.argv[1]
    outfileprefix = sys.argv[2]
    tc_thres = float(sys.argv[3])

    outfileF = outfileprefix + '.fp'

    # Handed to tccalc.get_fp so that concurrent runs do not write over the
    # same file.
    pid = str(os.getpid())

    names = _read_column(smilesfile1, 1)

    # Reuse the fingerprint file when it already exists, otherwise
    # calculate the fingerprints.
    if os.path.isfile(outfileF):
        print(outfileF + "exists.")
        fpvec1 = _read_column(outfileF, 2)
    else:
        fpvec1 = tccalc.get_fp(smilesfile1, outfileF, pid)

    # Fingerprint lookup by molecule name.
    fp_dic = {}
    for i, fp in enumerate(fpvec1):
        fp_dic[names[i]] = fp

    name_list = names
    cluster = 1
    while name_list:  # loop until every molecule is assigned
        print("on cluster %d" % cluster)
        outclusterfile = outfileprefix + ".cluster." + str(cluster)
        seed_name = name_list[0]  # top molecule seeds the cluster
        fp1 = fp_dic[seed_name]
        leftover = []
        with open(outclusterfile, 'w') as out:
            for i, name in enumerate(name_list):
                TC = tccalc.tanimoto(fp1, fp_dic[name])
                # The seed (i == 0) always belongs to its own cluster.
                # Without this, a threshold that the seed's self-similarity
                # does not exceed (e.g. 1.0) would leave the list unchanged
                # and the loop would never terminate.
                if i == 0 or TC > tc_thres:
                    print(i)
                    print(seed_name + ',' + name + ":" + str(TC))
                    out.write('%s,%f\n' % (name, TC))
                else:
                    leftover.append(name)  # try again in a later cluster
        name_list = leftover
        cluster += 1
def _read_column(path, index):
    """Return whitespace-separated field *index* of every line in *path*.

    Raises IndexError if a line has too few fields.
    """
    with open(path, 'r') as handle:
        return [line.split()[index] for line in handle]


def main():
    """Best-first cluster a SMILES file, pruning with the triangle inequality.

    Command line: ``smiles1 outputprefix threshold``.

    Molecule names are the second field of each SMILES line.  Fingerprints
    are read from ``<outputprefix>.fp`` (third field of each line) when that
    file exists, otherwise computed with ``tccalc.get_fp``.

    Repeatedly, the first molecule still in the list becomes the seed of the
    next cluster, written to ``<outputprefix>.cluster.<n>``.  During the
    first pass the Tanimoto coefficient of every rejected molecule to the
    first seed is remembered.  Later passes use those values, treating
    ``1 - tc`` as a distance that satisfies the triangle inequality, to
    decide without a new Tanimoto calculation whenever the bound is
    conclusive:

    * ``|d(1,2) - d(1,3)| > 1 - threshold``: molecule 3 cannot be in the
      cluster of seed 2.
    * ``d(1,2) + d(1,3) <= 1 - threshold``: molecule 3 must be in it; the
      line is written as ``name,<=bound``.

    Otherwise the coefficient is calculated and compared with ``threshold``.

    Raises:
        ValueError: ``threshold`` is not a number.
        IndexError: an input line has too few fields, or there are fewer
            names than fingerprints.
        KeyError: a name has no fingerprint.
    """
    if len(sys.argv) != 4:  # wrong number of arguments
        print("ERORR")
        print("syntexs: python best_frist_tc_clusters.py smiles1 outputprefix threshold")
        return

    smilesfile1 = sys.argv[1]
    outfileprefix = sys.argv[2]
    tc_thres = float(sys.argv[3])

    outfileF = outfileprefix + '.fp'

    # Handed to tccalc.get_fp so that concurrent runs do not write over the
    # same file.
    pid = str(os.getpid())

    names = _read_column(smilesfile1, 1)

    # Reuse the fingerprint file when it already exists, otherwise
    # calculate the fingerprints.
    if os.path.isfile(outfileF):
        print(outfileF + "exists.")
        fpvec1 = _read_column(outfileF, 2)
    else:
        fpvec1 = tccalc.get_fp(smilesfile1, outfileF, pid)

    # Per-name fingerprint and Tanimoto coefficient to the first seed
    # (-1.0 until the first pass has measured it).
    fp_dic = {}
    tc_dic = {}
    for i, fp in enumerate(fpvec1):
        fp_dic[names[i]] = fp
        tc_dic[names[i]] = -1.0

    max_dist = 1 - tc_thres  # largest 1-tc distance allowed inside a cluster
    name_list = names
    cluster = 1
    first_pass = True
    while name_list:  # loop until every molecule is assigned
        outclusterfile = outfileprefix + ".cluster." + str(cluster)
        seed_name = name_list[0]  # top molecule seeds the cluster
        leftover = []
        with open(outclusterfile, 'w') as out:
            fp1 = fp_dic[seed_name]
            onentc_1_2 = 1.0 - tc_dic[seed_name]  # distance first seed -> seed
            print("on cluster %d - 1-tc=%f" % (cluster, onentc_1_2))
            for i, name in enumerate(name_list):
                onentc_1_3 = 1.0 - tc_dic[name]  # distance first seed -> name
                # The seed (i == 0) is never pruned and is always assigned
                # below, so every pass removes at least one molecule and the
                # loop terminates for any threshold.
                is_seed = (i == 0)
                if (not first_pass and not is_seed and
                        abs(onentc_1_2 - onentc_1_3) > max_dist):
                    # Lower bound on the distance already exceeds the limit.
                    print("%s inequality, not in cluster %s %s"
                          % (i, onentc_1_2, onentc_1_3))
                    leftover.append(name)
                elif not first_pass and onentc_1_2 + onentc_1_3 <= max_dist:
                    # Upper bound on the distance is within the limit.  For
                    # thresholds below 1 this is not expected to trigger,
                    # since both molecules would then have joined cluster 1.
                    print("%s inequality, in cluster %s %s"
                          % (i, onentc_1_2, onentc_1_3))
                    TC = 1 - (onentc_1_2 + onentc_1_3)
                    print(seed_name + ',' + name + ": <= " + str(TC))
                    out.write('%s,<=%f\n' % (name, TC))
                else:
                    TC = tccalc.tanimoto(fp1, fp_dic[name])
                    if is_seed or TC > tc_thres:
                        print(i)
                        print(seed_name + ',' + name + ":" + str(TC))
                        out.write('%s,%f\n' % (name, TC))
                    else:
                        if first_pass:
                            tc_dic[name] = TC  # remembered for later pruning
                        leftover.append(name)  # try again in a later cluster
        first_pass = False
        name_list = leftover
        cluster += 1