Exemple #1
0
def main():
    """Command-line entry point of the GCMStoolbox IMPORT tool.

    Parses the command line, expands the IMPORTFILE arguments with ``glob``,
    reads every spectrum of every file through ``readspectrum`` and stores it
    in ``data['spectra']`` under the key ``"S<number> <name>"`` (truncated to
    77 characters). The result is written with ``gcmstoolbox.saveJSON`` and
    the process exits.

    With ``--append`` the spectra are added to an existing JSON file whose
    ``info.mode`` is ``"spectra"``. Unless ``--allmodels`` is given, a
    spectrum carrying both 'OR' and 'RI' that has the same 'RI' as the
    previously stored spectrum of the same file is treated as an alternative
    model: only the one with the lowest 'OR' is kept.
    """
    print(
        "\n*******************************************************************************"
    )
    print(
        "* GCMStoolbox - a set of tools for GC-MS data analysis                        *"
    )
    print("*   Version: {} ({})                                             *".
          format(gcmstoolbox.version, gcmstoolbox.date))
    print(
        "*   Author:  Wim Fremout, Royal Institute for Cultural Heritage               *"
    )
    print(
        "*   Licence: GNU GPL version 3                                                *"
    )
    print(
        "*                                                                             *"
    )
    print(
        "* IMPORT:                                                                     *"
    )
    print(
        "*   import one or more AMDIS (.elu, .msl, .csl, .isl) and NIST MS SEARCH      *"
    )
    print(
        "*   (.msp) files and store the mass spectra in GCMStoolbox JSON format        *"
    )
    print(
        "*                                                                             *"
    )
    print(
        "*******************************************************************************\n"
    )

    ### OPTIONPARSER

    usage = "usage: %prog [options] IMPORTFILE1 [IMPORTFILE2 [...]]"

    parser = OptionParser(usage,
                          version="GCMStoolbox version " +
                          gcmstoolbox.version + " (" + gcmstoolbox.date +
                          ")\n")
    parser.add_option("-v",
                      "--verbose",
                      help="Be very verbose [not default]",
                      action="store_true",
                      dest="verbose",
                      default=False)
    parser.add_option("-o",
                      "--jsonout",
                      help="JSON output file name [default: gcmstoolbox.json]",
                      action="store",
                      dest="jsonout",
                      type="string",
                      default="gcmstoolbox.json")
    parser.add_option("-a",
                      "--append",
                      help="Append to existing json file [not default]",
                      action="store_true",
                      dest="append",
                      default=False)

    group = OptionGroup(parser, "IMPORT OPTIONS",
                        "Special formatting options for the ELinC project")
    group.add_option(
        "-s",
        "--specno",
        help=
        "Override spectrum numbering, start with I [default: 1]; the append option may override this",
        action="store",
        dest="i",
        default=1,
        type="int")
    group.add_option(
        "-n",
        "--norm",
        help=
        "Normalise to a given maximum, 0 to skip normalisation [default=999])",
        action="store",
        dest="n",
        default=999,
        type="int")
    group.add_option(
        "--allmodels",
        help="For AMDIS .ELU files: import all models [not default]",
        action="store_true",
        dest="allmodels",
        default=False)
    parser.add_option_group(group)

    group = OptionGroup(parser, "ELinC",
                        "Special formatting options for the ELinC project")
    group.add_option(
        "-e",
        "--elinc",
        help="Retrieve parameters from the structured file names [not default]",
        action="store_true",
        dest="elinc",
        default=False)
    parser.add_option_group(group)

    (options, args) = parser.parse_args()

    ### ARGUMENTS AND OPTIONS

    cmd = " ".join(sys.argv)

    if options.verbose: print("Processing import files and options")

    # make a list of input files
    if not args:
        print(" !! No import files?\n")
        exit()

    # Expand wildcards into a set (drops duplicates), then build a NEW list
    # without directories. The list must not be modified while it is being
    # iterated: that skips the entry following every removed item, so a
    # directory could survive and fail later in open().
    matches = set()
    for arg in args:
        matches.update(glob(arg))
    # sorted() makes the spectrum numbering reproducible between runs
    inFiles = sorted(f for f in matches if not os.path.isdir(f))

    if options.verbose:
        for inFile in inFiles:
            print(" - import file: " + inFile)

    # number of inFiles; must not be 0
    numInFiles = len(inFiles)
    if numInFiles == 0:
        print(" !! No import files?\n")
        exit()
    elif options.verbose:
        print(" => " + str(numInFiles) + " import files")

    if options.verbose:
        print(" => JSON output file: " + options.jsonout +
              (" [append]" if options.append else ""))

    if options.append:
        data = gcmstoolbox.openJSON(options.jsonout)

        # check if it is a spectra file (cannot append to groups file)
        if data['info']['mode'] != "spectra":
            print(" !! Cannot append to a '" + data['info']['mode'] +
                  "' mode data file.\n")
            exit()

        # add administration to the info section
        data['info']['cmds'].append(cmd)
        # 'sources' is not created by the non-append branch below, so it may
        # be missing from the file we append to
        data['info'].setdefault('sources', []).extend(inFiles)

        # spectrum number counter: honour --specno only when it lies beyond
        # the number of spectra already stored
        if len(data['spectra']) < options.i:
            i = options.i
        else:
            i = len(data['spectra']) + 1
    else:
        data = OrderedDict()
        data['info'] = OrderedDict([('mode', 'spectra'), ('cmds', [cmd])])
        data['spectra'] = OrderedDict()
        i = options.i  # spectrum number

    if options.elinc and options.verbose:
        print(" => ELinC special formatting is set")

    ### ITERATE THROUGH INFILES

    # init progress bar
    if not options.verbose:
        print("\nProcessing files")
        j = 0
        k = len(inFiles)
        gcmstoolbox.printProgress(j, k)

    for inFile in inFiles:
        if options.verbose: print("\nProcessing file: " + inFile)

        # readspectrum and elincize receive the bare file name, not the path
        sourceName = os.path.basename(inFile)

        with open(inFile, 'r') as fh:  #file handle closes itself
            lastSpectrum = None  # key of the spectrum stored last for this file
            while True:
                # read spectra
                spectrum = readspectrum(fh,
                                        sourceName,
                                        norm=options.n,
                                        elinc=options.elinc,
                                        verbose=options.verbose)

                # readspectrum signals the end of the file with "eof"
                if spectrum == "eof":
                    break

                # apply special ELinC formatting
                if options.elinc:
                    elincize(spectrum, sourceName, verbose=options.verbose)

                # store only the model with the lowest OR (except if the
                # --allmodels command line option is active)
                if (not options.allmodels and 'OR' in spectrum
                        and 'RI' in spectrum and lastSpectrum):
                    previous = data['spectra'][lastSpectrum]
                    # same RI as the previously stored spectrum => another
                    # model for the same scan; compare only when the stored
                    # spectrum carries both fields
                    if ('RI' in previous and 'OR' in previous
                            and spectrum['RI'] == previous['RI']):
                        if spectrum['OR'] >= previous['OR']:
                            # the stored model has the lower OR: skip this one
                            if options.verbose:
                                print(
                                    "    - Skipping: a more likely model is already stored"
                                )
                            continue

                        if options.verbose:
                            print(
                                "    - Replacing an already stored less likely model"
                            )
                        # overwrite: drop the stored model and reuse its number
                        del data['spectra'][lastSpectrum]
                        i -= 1

                # write spectrum
                spectrum['DB#'] = str(i)
                key = 'S{} {}'.format(i, spectrum.pop('Name'))
                key = key[:77]  # longer spectrum names cause problems in AMDIS
                data['spectra'][key] = spectrum

                # keep track of the previous spectrum in case of ELU models for the same peak
                lastSpectrum = key

                # increase spectrum number
                i += 1

        # adjust progress bar
        if not options.verbose:
            j += 1
            gcmstoolbox.printProgress(j, k)

    ### WRITE SPECTRA JSON

    print("\nWriting data file")
    gcmstoolbox.saveJSON(data, options.jsonout)

    print(" => Finalised. Wrote " + options.jsonout + "\n")
    exit()
Exemple #2
0
def main():
    """Command-line entry point of the GCMStoolbox GROUP tool.

    Parses the command line, loads the JSON data file with
    ``gcmstoolbox.openJSON`` and reads the single MSPEPSEARCH file given as
    argument. Every line starting with "unknown" (case-insensitive) is handed
    to ``readlist`` together with the RI / match-factor / merge options.
    Afterwards ``data['groups']`` is built from the module-level
    ``allocations`` mapping through ``buildgroups``, statistics are printed
    and stored in ``data['info']['grouping']``, ``info.mode`` is set to
    "group" and the file is saved with ``gcmstoolbox.saveJSON``.

    The function declares ``data``, ``allocations``, ``doubles``, ``j`` and
    ``k`` as globals; ``allocations`` and ``doubles`` are not assigned here
    (presumably filled by ``readlist`` - verify against that helper).
    """
    print(
        "\n*******************************************************************************"
    )
    print(
        "* GCMStoolbox - a set of tools for GC-MS data analysis                        *"
    )
    print("*   Version: {} ({})                                             *".
          format(gcmstoolbox.version, gcmstoolbox.date))
    print(
        "*   Author:  Wim Fremout, Royal Institute for Cultural Heritage               *"
    )
    print(
        "*   Licence: GNU GPL version 3                                                *"
    )
    print(
        "*                                                                             *"
    )
    print(
        "* GROUP:                                                                      *"
    )
    print(
        "*   Search groups in a NIST search of a large dataset against itself          *"
    )
    print(
        "*                                                                             *"
    )
    print(
        "*******************************************************************************\n"
    )

    ### OPTIONPARSER

    usage = "usage: %prog [options] MSPEPSEARCH_FILE"

    parser = OptionParser(usage,
                          version="GCMStoolbox version " +
                          gcmstoolbox.version + " (" + gcmstoolbox.date +
                          ")\n")
    parser.add_option("-v",
                      "--verbose",
                      help="Be very verbose",
                      action="store_true",
                      dest="verbose",
                      default=False)
    parser.add_option("-i",
                      "--jsonin",
                      help="JSON input file name [default: gcmstoolbox.json]",
                      action="store",
                      dest="jsonin",
                      type="string",
                      default="gcmstoolbox.json")
    # no default here: options.jsonout is None unless -o is given (handled below)
    parser.add_option(
        "-o",
        "--jsonout",
        help="JSON output file name [default: same as JSON input file]",
        action="store",
        dest="jsonout",
        type="string")

    group = OptionGroup(
        parser, "RETENTION INDEX GROUPING CRITERIUM",
        "Only select matching mass spectra that have a retention index matching an RI window around the RI of the unknown spectrum.\n[RIwindow] = [RIfixed] + [RIfactor] * RI\nNote: if both RIfixed and RIfactor are zero, no retention based grouping will be applied."
    )
    group.add_option("-r",
                     "--rifixed",
                     help="Apply an RI window with fixed term. [default: 0]",
                     action="store",
                     dest="rifixed",
                     type="float",
                     default=0)
    group.add_option(
        "-R",
        "--rifactor",
        help="Apply an RI window with RI-dependent factor [default: 0]",
        action="store",
        dest="rifactor",
        type="float",
        default=0)
    group.add_option("-D",
                     "--discard",
                     help="Discard hits without RI",
                     action="store_true",
                     dest="discard",
                     default=False)
    parser.add_option_group(group)

    group = OptionGroup(
        parser, "NIST MS SEARCH GROUPING CRITERIUM",
        "(Reverse) match settings are set in and calculated by MSPEPSEARCH. However, the options below can be used to set a minimal MF and/or RMF for the grouping process."
    )
    group.add_option("-m",
                     "--match",
                     help="Apply NIST MS match limit [default: 0]",
                     action="store",
                     dest="minmf",
                     type="int",
                     default=0)
    group.add_option("-n",
                     "--reverse",
                     help="Apply NIST MS reverse match limit [default: 0]",
                     action="store",
                     dest="minrmf",
                     type="int",
                     default=0)
    parser.add_option_group(group)

    group = OptionGroup(
        parser, "AMBIGUOUS MATCHES",
        "Sometimes a spectrum is matched against a series of spectra that are allocated to two or more different groups. By default, these groups are not merged."
    )
    group.add_option("-M",
                     "--merge",
                     help="Merge groups with ambiguous matches",
                     action="store_true",
                     dest="merge",
                     default=False)
    parser.add_option_group(group)

    (options, args) = parser.parse_args()

    ### ARGUMENTS AND OPTIONS

    # module-level state shared with helpers that are not part of this function
    global data, allocations, doubles, j, k

    # full command line, stored in the JSON file as a processing trace
    cmd = " ".join(sys.argv)

    if options.verbose: print("Processing arguments")

    # input file: exactly one existing MSPEPSEARCH file is required
    if len(args) == 0:
        print(" !! No MSPEPSEARCH file?\n")
        exit()
    elif len(args) >= 2:
        print(
            "  !! Too many arguments. Only one MSPEPSEARCH file can be processed."
        )
        exit()
    elif os.path.isfile(args[0]):
        inFile = args[0]
    else:
        print("  !! MSPEPSEARCH file " + args[0] + " not found.")
        exit()

    # check and read JSON input file
    data = gcmstoolbox.openJSON(options.jsonin)

    # json output: fall back to the input file name when -o was not given
    if options.jsonout == None:
        options.jsonout = options.jsonin

    if options.verbose:
        print(" => JSON input file:  " + options.jsonin)
        print(" => JSON output file: " + options.jsonout + "\n")

    ### GROUP

    # init progress bar
    print("\nProcessing file: " + inFile)
    k = len(data['spectra'])
    if not options.verbose:
        j = 0
        gcmstoolbox.printProgress(j, k)

    # open MSPEPSEARCH file, read and interpret it line by line
    i = 1
    with open(inFile, 'r') as fh:
        for line in fh:
            # readlist receives the file handle and returns a replacement for
            # `line`, which is tested again on the next pass; the inner loop
            # runs at most k (= number of spectra) times per outer line.
            # NOTE(review): presumably readlist consumes the hit list and
            # returns the next "unknown" header or "eof" - confirm in readlist.
            for z in range(k):
                if line.casefold().startswith('unknown'):
                    line, i = readlist(fh, line, i, options.rifixed,
                                       options.rifactor, options.discard,
                                       options.minmf, options.minrmf,
                                       options.merge, options.verbose)

                    # update progress bar
                    if not options.verbose:
                        j += 1
                        gcmstoolbox.printProgress(j, k)

                    # this only leaves the inner loop; the outer loop then
                    # asks the file handle for the next line
                    if line == "eof": break

    ### BUILD GROUPS

    print("\nGrouping spectra ...")
    data['groups'] = OrderedDict()

    # init progress bar
    if not options.verbose:
        j = 0
        k = len(data['spectra'])
        gcmstoolbox.printProgress(j, k)

    # allocations maps a spectrum (s) to a group number (g); group keys are
    # written as "G<number>"
    for s, g in allocations.items():
        g = "G" + str(g)
        buildgroups(data['groups'], g, s)

        # adjust progress bar
        if not options.verbose:
            j += 1
            gcmstoolbox.printProgress(j, k)

    del allocations

    ### STATS

    # the same `doubles` content is reported as "merged" or "ambiguous"
    # depending on the --merge option
    stats = OrderedDict()
    stats["spectra"] = len(data['spectra'])
    stats["groups"] = len(data['groups'])
    if options.merge: stats["merged"] = [sorted(d) for d in doubles.values()]
    else: stats["ambiguous"] = [sorted(d) for d in doubles.values()]
    stats["stats"] = groupstats(data['groups'])

    print("\nSTATISTICS")
    print("  - Number of mass spectra: " + str(stats["spectra"]))
    print("  - Number of groups:       " + str(stats["groups"]))
    if not options.merge:
        print("  - Groups that may be the same component:")
        for key in sorted(doubles.keys()):
            print("      - " + ", ".join(str(d) for d in sorted(doubles[key])))
    print("  - Number of hits per group:")

    # verbose mode recomputes the statistics with the verbose flag for display;
    # the version stored in the JSON file is always the non-verbose one
    if options.verbose:
        lines = groupstats(data['groups'], options.verbose)
    else:
        lines = stats["stats"]
    for l in lines:
        print("      - " + l)

    ### UPDATE JSON FILE

    if options.verbose:
        print("\nUpdate JSON output file: " + options.jsonout + "\n")
    data["info"]["mode"] = "group"
    data["info"]["grouping"] = stats
    data["info"]["cmds"].append(cmd)
    gcmstoolbox.saveJSON(data, options.jsonout)  # backup and save json
    print("\nFinalised. Wrote " + options.jsonout + "\n")

    exit()
Exemple #3
0
def main():
  """Command-line entry point of the GCMStoolbox FILTER tool.

  The first positional argument selects a command:

  * ``list``  - print every filter stored in ``data['filters']``
  * ``on`` / ``off`` - enable or disable the filters named by the remaining
    arguments (``F`` prefix optional) and save the JSON file
  * ``make``  - define a new filter from up to three criteria: group names
    (-g), minimal spectrum count (-c, optionally per source with -C) and
    relative intensity of chosen m/z values (-m, -M, -s)

  ``make`` starts with every group as a candidate for removal; each active
  criterium discards the candidates that must be kept. The remaining group
  names are stored as ``data['filters']['F<n>']['out']``, ``info.mode`` is set
  to "filter" and the file is saved with ``gcmstoolbox.saveJSON``.
  Every code path ends with ``exit()``.
  """
  print("\n*******************************************************************************")
  print(  "* GCMStoolbox - a set of tools for GC-MS data analysis                        *")
  print(  "*   Version: {} ({})                                             *".format(gcmstoolbox.version, gcmstoolbox.date))
  print(  "*   Author:  Wim Fremout, Royal Institute for Cultural Heritage               *")
  print(  "*   Licence: GNU GPL version 3                                                *")
  print(  "*                                                                             *")
  print(  "* FILTER                                                                      *")
  print(  "*   Reduces the groups json file based on a number of filtering options       *")
  print(  "*                                                                             *")
  print(  "*******************************************************************************\n")

  ### OPTIONPARSER

  usage = "\n\nCommands:\n"
  usage += "  list    Overview of defined filters\n"
  usage += "           --> usage: %prog list [options]\n"
  usage += "  on      Enable filter\n"
  usage += "           --> usage: %prog on [options] FILTER_NUMBERS\n"
  usage += "  off     Disable filter\n"
  usage += "           --> usage: %prog off [options] FILTER_NUMBERS\n"
  usage += "  make    Define a new filter\n"
  usage += "           --> usage: %prog make [options]"

  parser = OptionParser(usage, version="GCMStoolbox version " + gcmstoolbox.version + " (" + gcmstoolbox.date + ")\n")
  parser.add_option("-v", "--verbose",    help="Be very verbose",  action="store_true", dest="verbose", default=False)
  parser.add_option("-i", "--jsonin",  help="JSON input file name [default: gcmstoolbox.json]", action="store", dest="jsonin", type="string", default="gcmstoolbox.json")
  parser.add_option("-o", "--jsonout", help="JSON output file name [default: same as JSON input file]", action="store", dest="jsonout", type="string")

  group = OptionGroup(parser, "MAKE: Filter out groups based on group number")
  group.add_option("-g", "--group",       help="Group number [default: 0], multiple possible", action="append", dest="group", type="string")
  parser.add_option_group(group)

  group = OptionGroup(parser, "MAKE: Filter out groups on the number of spectra in a group")
  group.add_option("-c", "--count",      help="Minimal number of spectra per group", action="store", dest="count", type="int")
  group.add_option("-C",                 help="Don't count multiple spectra from the same source", action="store_true", dest="sourcecount", default=False)
  parser.add_option_group(group)

  group = OptionGroup(parser, "MAKE: Filter out groups based on the presence of a chosen m/z")
  group.add_option("-m", "--mass",       help="m/z value, multiple possible", action="append", dest="mass", type="int")
  group.add_option("-M", "--percent",    help="Minimal relative intensity of a m/z value [default: 90]", action="store", dest="percent", type="int", default=90)
  group.add_option("-s", "--sum",        help="Calculate sumspectra with the N spectra with highest signal, 0 for all [default: 0]", action="store",  dest="n", type="int", default=0)
  parser.add_option_group(group)

  (options, args) = parser.parse_args()


  ### ARGUMENTS AND OPTIONS

  # full command line, stored in the JSON file as a processing trace
  cmd = " ".join(sys.argv)

  if options.verbose: print("Processing arguments...")

  # check and read JSON input file
  data = gcmstoolbox.openJSON(options.jsonin)
  if data['info']['mode'] == 'spectra':
    print("\n!! Cannot filter on ungrouped spectra.")
    exit()

  # json output: fall back to the input file name when -o was not given
  if options.jsonout is None:
    options.jsonout = options.jsonin

  if options.verbose:
    print(" => JSON input file:  " + options.jsonin)
    print(" => JSON output file: " + options.jsonout + "\n")

  # command and arguments
  if len(args) == 0:
    print(" !! No command given\n")
    exit()

  command = args[0].lower()
  # 'filters' only exists once a first filter has been made (see the end of
  # this function), so list/on/off must not index data['filters'] directly
  known = data.get('filters', {})

  if command.startswith("l"):   #LIST
    if len(args) > 1:
      print(" !! The list command does not support arguments\n")
      exit()
    for fname, fdef in known.items():
      print(fname + ": filters out " + str(len(fdef['out'])) + " groups [" + ("Enabled" if fdef['active'] else "Disabled") + "]")
      if 'crit1' in fdef: print("  - remove groups: " + fdef['crit1'])
      if 'crit2' in fdef: print("  - remove on spectrum count: " + fdef['crit2'])
      if 'crit3' in fdef: print("  - remove on m/z values: " + fdef['crit3'])
      print('')
    exit()
  elif command in ('on', 'off'):   #ON / OFF
    act = (command == 'on')
    changed = False
    for fname in (a.upper() for a in args[1:]):
      if not fname.startswith("F"): fname = "F" + fname
      if fname in known:
        known[fname]['active'] = act
        print(('Enabled ' if act else 'Disabled ') + fname)
        changed = True
    if changed:
      data["info"]["cmds"].append(cmd)
      gcmstoolbox.saveJSON(data, options.jsonout)     # backup and save json
      print(" => Updated " + options.jsonout + "\n")
    else:
      print(" !! Invalid filter names\n")
    exit()
  elif command.startswith("m"):   #MAKE
    if len(args) > 1:
      print(" !! The make command does not support arguments\n")
      exit()
    # else: proceed
  else:
    print(" !! Invalid command given\n")
    exit()

  #criterium flags
  c1 = options.group is not None  #CRITERIUM1: group numbers to be removed
  c2 = options.count is not None  #CRITERIUM2: minimal spectrum count per group
  c3 = options.mass  is not None  #CRITERIUM3: minimal intensity of chosen m/z values

  if not (c1 or c2 or c3):
    print("\n!! No criteria selected. Nothing to do.")
    exit()


  ### INITIALISE

  candidates = set(data["groups"].keys())
  # candidates for removal; each criterium will remove those groups that should be kept
  # since we iterate through a set that will be smaller after each criterium, we'll do the
  # most time-consuming criteria last

  def showcandidates():
    # verbose overview of what is still up for removal after a criterium
    print("candidates for removal:")
    if len(candidates) == 0:
      print("  none")
    else:
      print(tabulate(candidates))


  ### CRITERIUM 1: GROUP NUMBER
  if c1:
    removegroups = []
    for g in options.group:
      g = str(g).upper()
      if not g.startswith('G'): g = "G" + g
      removegroups.append(g)

    print("\nCRITERIUM 1: remove groups by group numbers: " + ", ".join(removegroups))
    if not options.verbose:
      i = 0
      j = len(candidates)
      gcmstoolbox.printProgress(i, j)

    for c in list(candidates):   # iterate over a copy of the set, so we can remove things from the original while iterating
      if c not in removegroups:
        candidates.discard(c)

      # progress bar
      if not options.verbose:
        i += 1
        gcmstoolbox.printProgress(i, j)

    if options.verbose:
      showcandidates()


  ### CRITERIUM 2: SPECTRUM COUNT
  if c2:
    print("\nCRITERIUM 2: remove groups with less than " + str(options.count) + " spectra...")
    if not options.verbose:
      i = 0
      j = len(candidates)
      gcmstoolbox.printProgress(i, j)

    for c in list(candidates):   # iterate over a copy of the set, so we can remove things from the original while iterating
      if not options.sourcecount:
        # count number of spectra
        size = data["groups"][c]["count"]
      else:
        # count number of distinct sources; every spectrum without a
        # "Source" counts on its own
        sources = set()
        nosource = 0
        for s in data["groups"][c]["spectra"]:
          if "Source" in data["spectra"][s]:
            sources.add(data["spectra"][s]["Source"])
          else:
            nosource += 1
        size = len(sources) + nosource

      if size >= options.count:  #remove from candidates = keep group
        candidates.discard(c)

      # progress bar
      if not options.verbose:
        i += 1
        gcmstoolbox.printProgress(i, j)

    if options.verbose:
      showcandidates()


  ### CRITERIUM 3: RUBBISH PEAK SEARCH
  if c3:
    print("\nCRITERIUM 3: remove groups with m/z value " + ", ".join(str(m) for m in options.mass))
    if not options.verbose:
      i = 0
      j = len(candidates)
      gcmstoolbox.printProgress(i, j)

    for c in list(candidates):
      # read the spectra in this group
      splist = [data['spectra'][s] for s in data['groups'][c]['spectra']]

      # if more than one spectrum, make sumspectrum
      if len(splist) > 1:
        sumsp = gcmstoolbox.sumspectrum(*splist, highest = options.n)
      else:
        sumsp = splist[0]

      # check masses
      # NOTE(review): max() compares the raw xydata values while the tested
      # value below goes through int(); confirm xydata values are numeric.
      remove = False
      threshold = max(sumsp['xydata'].values()) * 0.01 * options.percent
      for m in options.mass:
        if str(m) in sumsp['xydata']:
          if int(sumsp['xydata'][str(m)]) > threshold:     #remove group
            if options.verbose:
              # c is already the full group name ("G<number>")
              print(" --> " + c + " m/z=" + str(m) + " y-value=" + str(sumsp['xydata'][str(m)]) + " threshold=" + str(threshold))
            remove = True

      # final decision
      #if a group is tagged for removal, we need to keep it in the candidates set! if it is not tagged for removal, we eliminate it as a candidate
      if not remove:
        candidates.discard(c)

      # progress bar
      if not options.verbose:
        i += 1
        gcmstoolbox.printProgress(i, j)

    if options.verbose:
      showcandidates()


  ### UPDATE GROUPS AND WRITE IT AS JSON

  if 'filters' not in data:
    data['filters'] = OrderedDict()
  newname = "F" + str(len(data['filters']) + 1)

  data['filters'][newname] = OrderedDict()
  if c1: data['filters'][newname]['crit1'] = ", ".join(removegroups)
  if c2: data['filters'][newname]['crit2'] = str(options.count)
  if c3: data['filters'][newname]['crit3'] = "m/z " + ", ".join(str(m) for m in options.mass) + "; " + str(options.percent) + "%; " + str(options.n)
  data['filters'][newname]['active'] = True
  data['filters'][newname]['out'] = sorted(candidates)

  print("\nFilter " + newname)
  print("  - initial number of groups:  " + str( len(data['groups']) ))
  print("  - number of removed groups:  " + str( len(candidates) ))
  print("  - number of retained groups: " + str( len(data['groups']) - len(candidates) ))

  # combined effect of every enabled filter
  af = []
  ac = set()
  for fname, fdef in data['filters'].items():
    if fdef['active']:
      af.append(fname)
      ac.update(fdef['out'])

  print("\nAll active filters (" + ", ".join(af) + ")")
  print("  - initial number of groups:  " + str( len(data['groups']) ))
  print("  - number of removed groups:  " + str( len(ac) ))
  print("  - number of retained groups: " + str( len(data['groups']) - len(ac) ))

  data['info']['mode'] = "filter"
  data["info"]["cmds"].append(cmd)
  gcmstoolbox.saveJSON(data, options.jsonout)     # backup and save json

  print(" => Finalised. Wrote " + options.jsonout + "\n")
  exit()
Exemple #4
0
def main():
    """Command-line entry point of the GCMStoolbox EXPORT tool.

    Loads the JSON data file with ``gcmstoolbox.openJSON``, selects the
    spectra to export according to the mode and writes each of them to the
    MSP file given as the single positional argument through
    ``writespectrum``. Afterwards the JSON file is re-read, the command line
    is appended to ``info.cmds`` and the file is saved with
    ``gcmstoolbox.saveJSON``; the process then exits.

    Modes (only the first letter of ``--mode`` is examined):

    * ``auto``       - use ``info.mode`` of the data file ("filter" is
      treated as "group")
    * ``spectra``    - export ``data['spectra']``
    * ``group``      - export the spectra of the groups given with ``-g``,
      plus the first component whose 'Group' equals that group, if any
    * ``components`` - export ``data['components']``

    Group mode, whether requested or selected by ``auto``, requires at least
    one ``-g`` option. Group names may be given with or without the leading
    "G".
    """
    print(
        "\n*******************************************************************************"
    )
    print(
        "* GCMStoolbox - a set of tools for GC-MS data analysis                        *"
    )
    print("*   Version: {} ({})                                             *".
          format(gcmstoolbox.version, gcmstoolbox.date))
    print(
        "*   Author:  Wim Fremout, Royal Institute for Cultural Heritage               *"
    )
    print(
        "*   Licence: GNU GPL version 3                                                *"
    )
    print(
        "*                                                                             *"
    )
    print(
        "* EXPORT:                                                                     *"
    )
    print(
        "*   export the GCMStoolbox data file (JSON) into NIST MS SEARCH format (.msp) *"
    )
    print(
        "*                                                                             *"
    )
    print(
        "*******************************************************************************\n"
    )

    ### OPTIONPARSER

    usage = "usage: %prog [options] MSP_FILE"

    parser = OptionParser(usage,
                          version="GCMStoolbox version " +
                          gcmstoolbox.version + " (" + gcmstoolbox.date +
                          ")\n")
    parser.add_option("-v",
                      "--verbose",
                      help="Be very verbose [not default]",
                      action="store_true",
                      dest="verbose",
                      default=False)
    parser.add_option("-i",
                      "--jsonin",
                      help="JSON input file name [default: gcmstoolbox.json]",
                      action="store",
                      dest="jsonin",
                      type="string",
                      default="gcmstoolbox.json")
    parser.add_option(
        "-o",
        "--jsonout",
        help="JSON output file name [default: same as JSON input file]",
        action="store",
        dest="jsonout",
        type="string")
    parser.add_option(
        "-m",
        "--mode",
        help="Mode: auto|spectra|group|components [default:auto]",
        action="store",
        dest="mode",
        type="string",
        default="auto")
    # action="append" without a default: options.group is None (not an empty
    # list) when -g is never given
    parser.add_option(
        "-g",
        "--group",
        help=
        "Group numbers to export in group mode; multiple instances can be defined",
        action="append",
        dest="group",
        type="string")

    (options, args) = parser.parse_args()

    ### ARGUMENTS AND OPTIONS

    # full command line, stored in the JSON file as a processing trace
    cmd = " ".join(sys.argv)

    if options.verbose: print("Processing import files and options")

    # check MSP output file
    if len(args) == 0:
        print("  !! No MSP file name given\n")
        exit()
    elif len(args) != 1:
        print(
            "  !! Too many arguments. Only one MSP file name can be created.\n"
        )
        exit()
    else:
        mspfile = args[0]

    # check and read JSON input file
    data = gcmstoolbox.openJSON(options.jsonin)

    # json output: fall back to the input file name when -o was not given
    if options.jsonout is None:
        options.jsonout = options.jsonin

    if options.verbose:
        print(" => JSON input file:  " + options.jsonin)
        print(" => JSON output file: " + options.jsonout)
        print(" => Output msp file:  " + mspfile + "\n")

    ### MODE

    requested = options.mode.lower()
    if requested.startswith('a'):
        mode = data['info']['mode']
        if mode == 'filter': mode = 'group'
    elif requested.startswith('s'):
        mode = 'spectra'
    elif requested.startswith('g'):
        mode = 'group'
        if data['info']['mode'] == 'spectra':
            print("  !! No groups defined - run groups.py first\n")
            exit()
    elif requested.startswith('c'):
        mode = 'components'
        if data['info']['mode'] != 'components':
            print("  !! No components defined - run componentlib.py first\n")
            exit()
    else:
        print(
            "  !! Unknown mode (possible modes are 'auto', 'spectra', 'group' and 'components'\n"
        )
        exit()

    # Applies to explicit group mode AND to auto mode resolving to 'group'.
    # "not options.group" covers both None (no -g at all) and an empty list.
    if mode == 'group' and not options.group:
        print("  !! Group mode requires at least one group (-g)\n")
        exit()

    print("Mode: " + mode)

    ### WRITE FILE

    print("\nProcessing mass spectra")

    # make list of spectra to be added
    splist = OrderedDict()
    if (mode == "spectra") or (mode == "components"):
        splist = data[mode]
    elif mode == "group":
        for g in options.group:
            # accept "12", "g12" and "G12" alike; group keys are "G<number>"
            g = str(g).upper()
            if not g.startswith('G'):
                g = 'G' + g

            if g not in data['groups']:
                print(" !! " + g + " was not found.")
                continue

            # add original spectra to splist
            for s in data['groups'][g]['spectra']:
                splist[s] = data['spectra'][s]
            # if a component exists for this group, add it as well
            if 'components' in data:
                for c in data['components']:
                    if data['components'][c]['Group'] == g:
                        splist[c] = data['components'][c]
                        break

    with open(mspfile, "w") as fh:
        # init progress bar
        if not options.verbose:
            j = 0
            k = len(splist)
            gcmstoolbox.printProgress(j, k)

        for name, spectrum in splist.items():
            writespectrum(fh, mspfile, name, spectrum, options.verbose)

            # adjust progress bar
            if not options.verbose:
                j += 1
                gcmstoolbox.printProgress(j, k)

    print("\n => Wrote {}\n".format(mspfile))

    ### TRACE IN JSON FILE

    print("\nPut a trace in the JSON output file: " + options.jsonout + "\n")
    data = gcmstoolbox.openJSON(
        options.jsonin
    )  # reread the file to be sure we haven't accidentally messed up the data
    data['info']['cmds'].append(cmd)  # put a trace in the data file
    gcmstoolbox.saveJSON(data, options.jsonout)  # backup and save json

    exit()
Exemple #5
0
def main():
    """Generate a CSV report of a component library.

    Command line: ``[options] REPORT_CSV``.

    Steps:
      1. Parse the options and read the JSON input file (which must be in
         "components" mode).
      2. For every component, compute the mean IS of its spectra per group-by
         category (``--groupby``, default "Source").
      3. For every category, sum the IS of *all* spectra and divide it by the
         number of distinct sources in that category (average sum-IS).
      4. Write the CSV file: four header rows followed by one row per
         component.
      5. Re-read the JSON input file, append the command line to
         ``data['info']['cmds']`` and save it as the JSON output file.

    The function always terminates the interpreter through ``exit()``.
    """
    print(
        "\n*******************************************************************************"
    )
    print(
        "* GCMStoolbox - a set of tools for GC-MS data analysis                        *"
    )
    print("*   Version: {} ({})                                             *".
          format(gcmstoolbox.version, gcmstoolbox.date))
    print(
        "*   Author:  Wim Fremout, Royal Institute for Cultural Heritage               *"
    )
    print(
        "*   Licence: GNU GPL version 3                                                *"
    )
    print(
        "*                                                                             *"
    )
    print(
        "* REPORT                                                                      *"
    )
    print(
        "*   Generate CSV report of a component library                                *"
    )
    print(
        "*                                                                             *"
    )
    print(
        "*******************************************************************************\n"
    )

    ### OPTIONPARSER

    usage = "usage: %prog [options] REPORT_CSV"

    parser = OptionParser(usage,
                          version="GCMStoolbox version " +
                          gcmstoolbox.version + " (" + gcmstoolbox.date +
                          ")\n")
    parser.add_option("-v",
                      "--verbose",
                      help="Be very verbose",
                      action="store_true",
                      dest="verbose",
                      default=False)
    parser.add_option("-i",
                      "--jsonin",
                      help="JSON input file name [default: gcmstoolbox.json]",
                      action="store",
                      dest="jsonin",
                      type="string",
                      default="gcmstoolbox.json")
    parser.add_option(
        "-o",
        "--jsonout",
        help="JSON output file name [default: same as JSON input file]",
        action="store",
        dest="jsonout",
        type="string")
    parser.add_option(
        "-g",
        "--groupby",
        help=
        "Group measurements by categories (eg. Source, Sample, AAdays, Resin...)",
        action="store",
        dest="groupby",
        type="string",
        default="Source")

    (options, args) = parser.parse_args()

    ### ARGUMENTS

    # command line as typed; stored as a trace in the JSON file at the end
    cmd = " ".join(sys.argv)

    if options.verbose: print("Processing arguments...")

    # output file: exactly one positional argument is expected
    if len(args) == 0:
        print("\n!! Needs a file name for the CSV report")
        exit()
    elif len(args) == 1:
        outfile = args[0]
    else:
        print("\n!! Too many arguments")
        exit()

    # check and read JSON input file
    data = gcmstoolbox.openJSON(options.jsonin)
    if data['info']['mode'] != "components":
        print(
            "\n!! Reports can only be generated if the components have been built."
        )
        exit()

    # json output defaults to the input file
    if options.jsonout is None:
        options.jsonout = options.jsonin

    if options.verbose:
        print(" => JSON input file:  " + options.jsonin)
        print(" => JSON output file: " + options.jsonout)
        # the original referenced an undefined name here (NameError with -v)
        print(" => Output CSV file:  " + outfile + "\n")

    ### READ COMPONENTS

    print("\nRunning through components...")
    report = []

    if not options.verbose:
        i = 0
        j = len(data['components'])
        gcmstoolbox.printProgress(i, j)

    for c in data['components']:
        component = data['components'][c]

        # check all spectra of a component and accumulate, per group-by
        # category, the summed IS and the number of spectra
        sums = OrderedDict()
        for s in component['Spectra']:
            spectrum = data['spectra'][s]

            # lookup category in spectrum (or default to unknown)
            cat = spectrum.get(options.groupby, 'unknown')

            # spectra without an IS value count as 1
            spectrumIS = int(spectrum.get('IS', 1))

            if cat not in sums:
                sums[cat] = [spectrumIS, 1]
            else:
                sums[cat][0] += spectrumIS
                sums[cat][1] += 1

        # mean IS per category (integer division!); this is what is reported
        meanIS = OrderedDict(
            (cat, total // count) for cat, (total, count) in sums.items())

        # prepare report line for this component
        reportline = [
            "C" + component['DB#'],  # column A: component number
            len(component['Spectra']),  # column B: number of spectra in the component
            component['RI'],  # column C: component RI
            component['dRI'],  # column D: RI difference within the component
            meanIS  # ordereddict with category -> mean intensities
        ]
        report.append(reportline)

        # update progress bar
        if options.verbose:
            print("  - " + c)
        else:
            i += 1
            gcmstoolbox.printProgress(i, j)

    ### CALCULATE SUM-IS

    # the sum-IS is the sum of all spectra of a given source file
    # in case a category is composed of multiple source files, the sum-IS is the average
    # (sum of the IS values of all spectra within this category, divided by the number of sources)

    print("\nCalculate IS for each " + options.groupby + "...")

    if not options.verbose:
        i = 0
        j = len(data['spectra'])
        gcmstoolbox.printProgress(i, j)

    # compile a sorted list of all group-by categories used by the components
    categories = set()
    for line in report:
        categories.update(line[4].keys())
    categories = sorted(categories)

    # summed IS, number of spectra and set of sources for each category
    catIS = dict()
    catSpectra = dict()
    catSources = dict()

    for spectrum in data['spectra'].values():
        cat = spectrum.get(options.groupby, 'unknown')
        spectrumIS = int(spectrum.get('IS', 1))

        if cat not in catIS:
            catIS[cat] = spectrumIS
            catSpectra[cat] = 1
            catSources[cat] = set()
        else:
            catIS[cat] += spectrumIS
            catSpectra[cat] += 1
        # a spectrum without 'Source' is counted under 'unknown' instead of
        # aborting the report with a KeyError
        catSources[cat].add(spectrum.get('Source', 'unknown'))

        # update progress bar
        if options.verbose:
            print("  - S{}: category {} (#{})--> added {} to summed IS".format(
                spectrum['DB#'], cat, catSpectra[cat], spectrumIS))
        else:
            i += 1
            gcmstoolbox.printProgress(i, j)

    # number of sources and average sum-IS (integer division) per reported category
    nSources = dict((cat, len(catSources[cat])) for cat in categories)
    avgIS = dict((cat, catIS[cat] // nSources[cat]) for cat in categories)

    ### MAKE REPORT

    print("\nGenerating report...")

    if not options.verbose:
        i = 0
        j = len(report)
        gcmstoolbox.printProgress(i, j)

    # write report file
    with open(outfile, 'w', newline='') as fh:
        mkreport = csv.writer(fh, dialect='excel')

        # write header rows
        mkreport.writerow(["component", "number of spectra", "RI", "dRI"] +
                          categories)
        mkreport.writerow(["(average sum-IS)", "", "", ""] +
                          [avgIS[cat] for cat in categories])
        mkreport.writerow(["(number of spectra)", "", "", ""] +
                          [catSpectra[cat] for cat in categories])
        mkreport.writerow(["(number of sources)", "", "", ""] +
                          [nSources[cat] for cat in categories])

        # next rows: components
        for row in report:
            # the last item in a report item (row) is a dict of categories and mean IS
            # replace it with a complete and sorted list of mean IS'es
            rowIS = row.pop()
            for cat in categories:
                row.append(rowIS[cat] if cat in rowIS else "")
            # write row to report
            mkreport.writerow(row)

            if not options.verbose:
                i += 1
                gcmstoolbox.printProgress(i, j)

    print("\n => Wrote {}\n".format(outfile))

    ### TRACE IN JSON FILE

    print("\nPut a trace in the JSON output file: " + options.jsonout + "\n")
    data = gcmstoolbox.openJSON(
        options.jsonin
    )  # reread the file to be sure we haven't accidentally messed up the data
    data['info']['cmds'].append(cmd)  # put a trace in the data file
    gcmstoolbox.saveJSON(data, options.jsonout)  # backup and save json

    exit()
Exemple #6
0
def _condense_aadays(values):
    """Condense AAdays values into a comma separated list of sequences.

    Consecutive values of the series 0, 2, 4, 8, 16, 32, 64 are collapsed
    into "low-high" (0,2,4,8,32 becomes "0-8,32"); values outside that
    series are appended individually at the end.
    """
    remaining = sorted(int(x) for x in values)
    days = [0, 2, 4, 8, 16, 32, 64]
    seq = []
    for pos, low in enumerate(days):
        if low not in remaining:
            continue
        remaining.remove(low)  # lower limit of sequence
        last = None
        for high in days[pos + 1:]:
            if high not in remaining:
                break
            last = high  # higher limit of sequence (so far)
            remaining.remove(high)
        seq.append(str(low) if last is None else str(low) + "-" + str(last))
    # add possible AAdays values other than 0,2,4,8...
    seq.extend(str(x) for x in remaining)
    return ",".join(seq)


def main():
    """Build the component spectra from the groups in a GCMStoolbox JSON file.

    Steps:
      1. Parse the options and read the JSON input file (refused when it is
         still in "spectra" mode).
      2. Collect the group ids excluded by the active filters.
      3. Order the remaining groups by ``minRI`` (groups without ``minRI``
         come last) and build one component per group: a sum spectrum when
         the group holds several spectra, otherwise a copy of its spectrum.
      4. Store the components under ``data['components']``, set the mode to
         "components", append the command line to ``data['info']['cmds']``
         and save the JSON output file.

    The function always terminates the interpreter through ``exit()``.
    """
    print("\n*******************************************************************************")
    print(  "* GCMStoolbox - a set of tools for GC-MS data analysis                        *")
    print(  "*   Version: {} ({})                                             *".format(gcmstoolbox.version, gcmstoolbox.date))
    print(  "*   Author:  Wim Fremout, Royal Institute for Cultural Heritage               *")
    print(  "*   Licence: GNU GPL version 3                                                *")
    print(  "*                                                                             *")
    print(  "* BUILD                                                                       *")
    print(  "*   Builds the component spectra                                              *")
    print(  "*                                                                             *")
    print(  "*******************************************************************************\n")

    ### OPTIONPARSER

    usage = "usage: %prog [options]"

    parser = OptionParser(usage, version="GCMStoolbox version " + gcmstoolbox.version + " (" + gcmstoolbox.date + ")\n")
    parser.add_option("-v", "--verbose",  help="Be very verbose",  action="store_true", dest="verbose", default=False)
    parser.add_option("-i", "--jsonin",  help="JSON input file name [default: gcmstoolbox.json]", action="store", dest="jsonin", type="string", default="gcmstoolbox.json")
    parser.add_option("-o", "--jsonout", help="JSON output file name [default: same as JSON input file]", action="store", dest="jsonout", type="string")
    parser.add_option("-c", "--cnumber",  help="Start number for component numbers", action="store", dest="c", type="int" , default=1)
    parser.add_option("-p", "--preserve", help="Preserve group numbers", action="store_true", dest="preserve", default=False)
    parser.add_option("-s", "--sum",      help="Calculate sumspectra with the N spectra with highest signal, 0 for all [default: 0]", action="store",  dest="n", type="int", default=0)

    (options, args) = parser.parse_args()

    ### ARGUMENTS

    # command line as typed; stored as a trace in the JSON file at the end
    cmd = " ".join(sys.argv)

    if options.verbose: print("Processing arguments...")

    # this tool takes no positional arguments
    if len(args) != 0:
        print("\n!! Too many arguments")
        exit()

    # check and read JSON input file
    data = gcmstoolbox.openJSON(options.jsonin)
    if data['info']['mode'] == 'spectra':
        print("\n!! Cannot build components using ungrouped spectra.")
        exit()

    # json output defaults to the input file
    if options.jsonout is None:
        options.jsonout = options.jsonin

    if options.verbose:
        print(" => JSON input file:  " + options.jsonin)
        print(" => JSON output file: " + options.jsonout + "\n")

    # preserve and c number flags cannot be used together
    if options.preserve and (options.c != 1):
        print("\n!! The options -c (--cnumber) and -p (--preserve) cannot be used together.")
        exit()

    ### APPLY ACTIVE FILTERS

    print("\nApply filters...")
    if not options.verbose:
        i = 0
        j = len(data['filters'])
        gcmstoolbox.printProgress(i, j)

    # ids of the groups that are excluded by at least one active filter
    out = set()
    for fid, f in data['filters'].items():
        if f['active']:
            out.update(f['out'])
            if options.verbose: print(" - add " + fid)
        if not options.verbose:
            i += 1
            gcmstoolbox.printProgress(i, j)

    ### BUILD COMPONENTS

    print("\nBuild components...")

    i = 0  # we'll use this both for the progress bar and for the component number (i + options.c, if options.preserve is false)
    data['components'] = OrderedDict()

    # order the groups that pass the filters by minRI; groups without minRI
    # go to the back of the list in their original order
    withRI = []
    withoutRI = []
    for gid, group in data['groups'].items():
        if gid in out:  # apply filters
            continue
        if 'minRI' in group:
            withRI.append((float(group['minRI']), gid))
        else:
            withoutRI.append(gid)
    # list.sort is stable: groups with an equal minRI keep their original order
    withRI.sort(key=lambda pair: pair[0])
    groups = [gid for _, gid in withRI] + withoutRI

    # init progress bar
    if not options.verbose:
        j = len(groups)
        gcmstoolbox.printProgress(i, j)

    # build components from the groups
    for g in groups:
        group = data['groups'][g]

        # group or component numbering:
        if options.preserve:
            c = int(g.replace('G', ''))
        else:
            c = i + options.c

        # collect the spectra
        groupspectra = [data['spectra'][s] for s in group['spectra']]

        # if more than one spectrum, make sumspectrum
        if len(groupspectra) > 1:
            sp = gcmstoolbox.sumspectrum(*groupspectra, highest=options.n)
        else:
            # copy, so the metadata changes below do not touch the original spectrum
            sp = deepcopy(groupspectra[0])

        # rebuild the spectra metadata (and change for single spectra things)
        name = "C{} RI{}".format(str(c), str(round(float(sp['RI']))))
        sp['DB#'] = str(c)
        sp['Group'] = g
        sp['Spectra'] = group['spectra']

        for item in ["Source", "Sample", "Resin", "AAdays", "Color", "PyTemp"]:
            values = set(s[item] for s in groupspectra if item in s)
            if not values:
                continue

            # store as list in component
            sp[item] = sorted(values)

            # and add it to the component name (Source and Sample are not
            # part of the name)
            if item == "AAdays":
                name += " " + _condense_aadays(values) + "d"
            elif item == "Color":
                name += " " + "/".join(sorted(values))
            elif item in ("Source", "Sample"):
                pass
            else:
                name += " " + "-".join(sorted(values))

        # crop name (longer names cause problems in Amdis)
        name = name[:77]

        # add to data
        data['components'][name] = sp

        # NOTE: no "link" to the component is stored in the group data,
        # because it would remain present when components are built multiple
        # times and become ambiguous.

        i += 1

        # update progress bar
        if options.verbose:
            print("  - " + name)
        else:
            gcmstoolbox.printProgress(i, j)

    ### SAVE OUTPUT JSON

    print("\nSaving data...")

    data['info']['mode'] = "components"
    data["info"]["cmds"].append(cmd)
    gcmstoolbox.saveJSON(data, options.jsonout)     # backup and save json

    print(" => Wrote " + options.jsonout + "\n")
    exit()