Exemple #1
0
def enter():
    """Build the objects for this game state and store them in module globals.

    Instantiates ``ZORO``, ``MAP`` and ``Score``, stores the results of
    ``Hurdle1().create()`` and ``Jelly().create()``, and loads the
    'ENCR10B.TTF' font.  Objects are created in a fixed order.
    """
    global zoro, map
    global hurdle, jelly
    global score, font
    global running  # declared only; this function never assigns it

    zoro = ZORO()
    map = MAP()

    hurdle_source = Hurdle1()
    hurdle = hurdle_source.create()

    jelly_source = Jelly()
    jelly = jelly_source.create()

    score = Score()
    font = load_font('ENCR10B.TTF')
def enter():
    """Reset shared game state and build every object needed for a new run.

    Zeroes the position/state attributes kept on the ``MONSTER``, ``TRAP`` and
    ``ITEM`` modules, resets ``Level`` and ``score``, then creates the
    character, the per-stage monster lists, the monster attacks, the traps,
    the four stage maps, the HP gauge, the skill object and the protection
    items.  Everything is published through module-level globals, and
    ``current_time`` is set from ``get_time()`` last.
    """
    global girl, stage1_monster, stage2_monster, stage3_monster, stage4_monster, stage2_trap, stage1_map, stage2_map, stage3_map
    global hp_gauge, stage2_monster_attack, stage3_monster_attack, skill, current_time, Level, protected, stage4_map, score

    # Reset the shared object positions so that a restart begins from scratch.
    MONSTER.monster_positionX = 0
    MONSTER.monster_positionY = 0
    TRAP.trap_positionX = 0
    TRAP.trap_positionY = 0
    ITEM.protected_positionX = 0
    ITEM.protected_positionY = 0
    ITEM.protect_State = 0

    Level = 0
    score = 0

    girl = Character.Character()

    stage1_monster = [MONSTER.Stage1_Monster() for _ in range(20)]
    stage2_monster = [MONSTER.Stage2_Monster() for _ in range(20)]
    stage3_monster = [MONSTER.Stage3_Monster() for _ in range(20)]
    stage4_monster = [MONSTER.Stage4_Monster() for _ in range(40)]

    stage2_monster_attack = [Stage2_Attack() for _ in range(10)]
    stage3_monster_attack = [Stage3_Attack() for _ in range(15)]

    stage2_trap = [TRAP.Trap() for _ in range(13)]

    # Raw strings keep the backslash-separated paths byte-for-byte identical
    # while avoiding the invalid escape sequences ("\M", "\s") that the plain
    # literals contained.  Maps are still created from stage 4 down to stage 1.
    stage4_map = MAP.Map(r'image\MAP\MAP(STAGE4)_450x1200.png',
                         r'music\stage4BGM.mp3', 225, 25425)
    stage3_map = MAP.Map(r'image\MAP\MAP(STAGE3)_450x750.png',
                         r'music\stage3BGM.mp3', 225, 18000)
    stage2_map = MAP.Map(r'image\MAP\MAP(STAGE2)_450x750.png',
                         r'music\stage2BGM.mp3', 225, 10800)
    stage1_map = MAP.Map(r'image\MAP\MAP(STAGE1)_450x750.png',
                         r'music\stage1BGM.mp3', 225, 3600)

    hp_gauge = GAUGE.Gauge()
    skill = Skill()

    protected = [ITEM.Protected() for _ in range(5)]

    current_time = get_time()
Exemple #3
0
def enter():
    """Build the objects for this game state and store them in module globals.

    Instantiates ``ZORO``, ``MAP`` and ``Score``, stores the results of the
    ``create()`` calls on fresh ``Alch``, ``Crr``, ``Boom``, ``Ruppy``,
    ``Hurdle1`` and ``Jelly`` objects, loads the 'ENCR10B.TTF' font, and
    finally creates one extra ``Jelly``, ``Alch`` and ``Crr`` instance for the
    ``*_sound`` globals.  Objects are created in a fixed order.
    """
    global zoro, map
    global alch, crr, boom, ruppy, hurdle, jelly
    global score, font
    global jelly_sound, alch_sound, crr_sound
    global running  # declared only; this function never assigns it

    zoro = ZORO()
    map = MAP()

    alch_source = Alch()
    alch = alch_source.create()
    crr_source = Crr()
    crr = crr_source.create()
    boom_source = Boom()
    boom = boom_source.create()
    ruppy_source = Ruppy()
    ruppy = ruppy_source.create()
    hurdle_source = Hurdle1()
    hurdle = hurdle_source.create()
    jelly_source = Jelly()
    jelly = jelly_source.create()

    score = Score()
    font = load_font('ENCR10B.TTF')

    # Separate, un-"created" instances kept under the *_sound names.
    jelly_sound = Jelly()
    alch_sound = Alch()
    crr_sound = Crr()
Exemple #4
0
def lr_tune(train_data, validation_data, val_true_list, regParam, netParam):
    """Grid-search a linear regression over ``regParam`` x ``netParam``.

    For every (regParam, elasticNetParam) pair a ``LinearRegression`` is fitted
    on ``train_data`` (features in 'idf_features', label in 'rating',
    ``maxIter=200``) and scored on ``validation_data`` with two metrics:
    RMSE, and the value returned by ``MAP.getMAP`` for the per-user
    top-ranked predictions.  Progress and the two winners are printed.

    Args:
        train_data: DataFrame used for fitting.
        validation_data: DataFrame used for scoring; must carry 'user_id'
            and 'rating' columns (both are referenced below).
        val_true_list: Ground truth handed unchanged to ``MAP.getMAP``.
        regParam: Iterable of regularisation strengths to try.
        netParam: Iterable of elastic-net mixing values to try.

    Returns:
        ``(best_model_rmse, best_model_map)``: the model with the lowest
        RMSE and the model with the highest MAP.  Both are ``None`` when the
        grid is empty.
    """
    # Best-by-RMSE bookkeeping.
    min_error = float('inf')
    best_reg1 = None
    best_net1 = None
    best_model_rmse = None
    # Best-by-MAP bookkeeping.
    max_map = 0.0
    best_reg2 = None
    best_net2 = None
    best_model_map = None

    for reg in regParam:
        for net in netParam:
            lr = LinearRegression(featuresCol='idf_features', labelCol='rating',
                                  regParam=reg, elasticNetParam=net, maxIter=200)
            model = lr.fit(train_data)
            predictions = model.transform(validation_data)
            # Clamp negative predictions to 0 before scoring.
            predictions = predictions.withColumn(
                'prediction',
                when(predictions['prediction'] < 0, 0).otherwise(predictions['prediction']))

            # rmse
            evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating',
                                            predictionCol='prediction')
            rmse = evaluator.evaluate(predictions)
            if rmse < min_error:
                min_error = rmse
                best_reg1 = reg
                best_net1 = net
                best_model_rmse = model

            # MAP top 25
            # NOTE(review): rank() gives tied predictions the same rank, so a
            # user can end up with more than 25 rows here - confirm intended.
            window = Window.partitionBy(predictions['user_id']).orderBy(predictions['prediction'].desc())
            top_predictions = predictions.select('*', rank().over(window).alias('row_num')).filter(col('row_num') <= 25)
            current_map = MAP.getMAP(top_predictions, val_true_list)
            # Always accept the first candidate: with the previous strict
            # "> 0.0" test a grid whose MAP never rose above zero returned
            # None for the MAP model even though models had been trained.
            if best_model_map is None or current_map > max_map:
                max_map = current_map
                best_reg2 = reg
                best_net2 = net
                best_model_map = model

            print('regParam = {} with elasticNetParam = {}: validation RMSE is {} validation MAP is {}'.format(reg, net, rmse, current_map))

    print('The best model select by RMSE has regParam = {} with elasticNetParam = {}: RMSE = {}'.format(best_reg1, best_net1, min_error))
    print('The best model select by MAP has regParam = {} with elasticNetParam = {}: MAP = {}'.format(best_reg2, best_net2, max_map))

    return best_model_rmse, best_model_map
Exemple #5
0
    mapGenerator.generate_levels(1, 2)
    SETTINGS.levels_list = SETTINGS.glevels_list

    gameLoad.get_canvas_size()

    #Setup and classes

    text = TEXT.Text(0, 0, "YOU  WON", SETTINGS.WHITE, "DUGAFONT.ttf", 48)
    beta = TEXT.Text(5, 5, "META  ALFA  BUILD  V.1.6", SETTINGS.WHITE,
                     "DUGAFONT.ttf", 20)
    text.update_pos(
        SETTINGS.canvas_actual_width / 2 - text.layout.get_width() / 2,
        SETTINGS.canvas_target_height / 2 - text.layout.get_height() / 2)

    #Classes for later use
    gameMap = MAP.Map(SETTINGS.levels_list[SETTINGS.current_level].array)
    gameCanvas = Canvas(SETTINGS.canvas_map_width, SETTINGS.canvas_map_height)
    gamePlayer = PLAYER.Player(SETTINGS.player_pos)
    gameRaycast = RAYCAST.Raycast(gameCanvas.canvas, gameCanvas.window)
    gameInv = INVENTORY.inventory({'bullet': 150, 'shell': 25, 'ferromag': 50})
    gameHUD = HUD.hud()

    #More loading - Level specific
    gameLoad.load_new_level()

    #Controller classes
    menuController = MENU.Controller(gameCanvas.window)
    musicController = MUSIC.Music()
    tutorialController = TUTORIAL.Controller()

    #Run at last
Exemple #6
0
def enter():
    """Create the ``ZORO`` and ``MAP`` objects and store them in module globals."""
    global zoro
    global map
    global running  # declared only; this function never assigns it

    zoro = ZORO()
    map = MAP()
Exemple #7
0
    def cg(self, force=False, com=False):
        """Return the coarse grained beads of this chain, building them if needed.

        The result is cached in ``self._cg``; a non-empty cache is returned
        unchanged unless ``force`` is true.  Every entry is a tuple
        ``(bead name, residue name[:3], residue id, chain, x, y, z, ss number)``.
        As a side effect one list of atom indices per bead is appended to
        ``self.mapping``.

        Args:
            force: Regenerate the structure even when a cached one exists.
            com: Shift each O3' atom to the following residue before mapping
                (needed for DNA).  This edits the residue lists held in
                ``self.residues`` in place.

        Returns:
            The list stored in ``self._cg``.

        Exits the process via ``sys.exit(1)`` when at least one residue had
        too many atoms missing to be mapped.
        """
        # If the coarse grained structure is set already, just return,
        # unless regeneration is forced.
        if self._cg and not force:
            return self._cg
        self._cg = []
        atid     = 1
        bb       = [1]
        fail     = False
        previous = ''
        for residue, rss, resname in zip(self.residues, self.sstypes, self.sequence):
            # For DNA we need to get the O3' to the following residue when calculating COM
            # The force and com options ensure that this part does not affect itp generation or anything else
            if com:
                # Just an initialization, this should complain if it isn't updated in the loop
                store = 0
                for ind, i in enumerate(residue):
                    if i[0] == "O3'":
                        if previous != '':
                            # Swap in the O3' remembered from the previous residue
                            residue[ind] = previous
                            previous = i
                        else:
                            # First O3' seen: remember where it is so it can be dropped below
                            store = ind
                            previous = i
                # We couldn't remove the O3' from the 5' end residue during the loop so we do it now
                if store > 0:
                    del residue[store]

            # Check if residues names has changed, for example because user has set residues interactively.
            residue = [(atom[0], resname)+atom[2:] for atom in residue]
            if residue[0][1] in ("SOL", "HOH", "TIP"):
                continue
            if residue[0][1] not in MAP.CoarseGrained.mapping.keys():
                logging.warning("Skipped unknown residue %s\n" % residue[0][1])
                continue
            # Get the mapping for this residue
            # CG.map returns bead coordinates and mapped atoms
            # This will fail if there are (too many) atoms missing, which is
            # only problematic if a mapped structure is written; the topology
            # is inferred from the sequence. So this is the best place to raise
            # an error
            try:
                beads, ids = MAP.map(residue, ca2bb=self.options['ForceField'].ca2bb)
                # Materialise the zip so that len(beads) below works whether
                # zip returns a list or an iterator.
                beads      = list(zip(MAP.CoarseGrained.names[residue[0][1]], beads, ids))
                if residue[0][1] in self.options['ForceField'].polar:
                    beads = add_dummy(beads, dist=0.14, n=2)
                elif residue[0][1] in self.options['ForceField'].charged:
                    beads = add_dummy(beads, dist=0.11, n=1)
            except ValueError:
                logging.error("Too many atoms missing from residue %s %d(ch:%s):",
                              residue[0][1], residue[0][2]-(32 << 20), residue[0][3])
                logging.error(repr([i[0] for i in residue]))
                fail = True
                # No usable beads for this residue.  Without this reset the
                # loop below would either hit an unbound name (first residue)
                # or re-append the beads of the previous residue.
                beads = []

            for name, (x, y, z), ids in beads:
                # Add the bead with coordinates and secondary structure id to the list
                self._cg.append((name, residue[0][1][:3], residue[0][2], residue[0][3], x, y, z, SS.ss2num[rss]))
                # Add the ids to the list, after converting them to indices to the list of atoms
                self.mapping.append([atid+i for i in ids])

            # Increment the atom id; This pertains to the atoms that are included in the output.
            atid += len(residue)

            # Keep track of the numbers for CONECTing
            bb.append(bb[-1]+len(beads))

        if fail:
            logging.error("Unable to generate coarse grained structure due to missing atoms.")
            sys.exit(1)

        return self._cg
Exemple #8
0
# Decision module: reads sensor entries from the self.md store and writes
# "Move" commands back into it.  Written for Python 2 (print statements).
# NOTE(review): self.md is presumably supplied by the MP base class - confirm.
class BRAIN(MP):
    # Build the MAP object from self.md and keep it on self.map.
    def init(self):
        self.map = MAP(self.md)

    # Placeholder; currently does nothing.
    def explore(self):
        # find most unexplored area, and drive there
        pass

    # One decision step: handle a bumper hit first, otherwise integrate the
    # latest laser points into the map and pick a "Move" command.
    def run_impl(self):
        # first, check bumper
        if "Bumper" in self.md:
            print "==========================================================="
            print "============  BUMPER           ============================"
            print "==========================================================="
            print "==========================================================="
            # hardcoded evade behavior: back up for 2 s, then turn left for 1 s
            # (this blocks the caller for 3 seconds in total)
            self.md["Move"] = [0.2, "backward"]
            time.sleep(2)
            self.md["Move"] = [0.2, "left"]
            time.sleep(1)

            del self.md["Bumper"]

            #clean up from this round
            # NOTE(review): deleted without checking that the key exists;
            # with a plain dict this would raise KeyError if it is absent.
            del self.md["lidar_points"]
            return

        # next, if we have odometry, save and react to laser input
        if "MCS" in self.md:
            # The "MCS" entry is copied to "WCS" unchanged.
            self.md["WCS"] = self.md["MCS"]

            if "lidar_points" in self.md:
                # build map using laser points
                self.map.integrate(self.md["WCS"], self.md["lidar_points"])
                #self.map.visualize(((getMs()-self.md["starttime"]) / 1000.),save=True)
                # save a snapshot of relevant data to files
                # (disabled: the string literal below is never executed as code)
                '''
                tsnow = getMs()
                np.save("/tmp/%d_LIDAR"%tsnow, self.md["lidar_points"])
                coord = np.array([self.md["WCS"].x, self.md["WCS"].y, self.md["WCS"].a])
                np.save("/tmp/%d_WCS"%tsnow,coord)
                np.save("/tmp/%d_MAPPOINTS"%tsnow, self.map.mappoints)
                np.save("/tmp/%d_TILES"%tsnow, self.map.tiles)
                '''

                free = True

                # Inspect indices 10 down to -9 of self.md["lidar"] and print
                # one marker per reading: "." for NaN, "X" for a reading
                # strictly between 100 and 500, "O" otherwise.
                # NOTE(review): the guard above tests "lidar_points" but the
                # readings come from the "lidar" key - confirm both are set.
                # NOTE(review): negative indices presumably wrap to the end of
                # the sequence; units of the 100/500 thresholds are not shown.
                for i in range(10, -10, -1):
                    m = self.md["lidar"][i]
                    if np.isnan(m):
                        print ".",
                    elif m > 100 and m < 500:
                        print "X",
                        free = False
                    else:
                        print "O",

                    print " ",
                print

                # Any "X" reading means turn left; otherwise drive forward.
                if not free:
                    self.md["Move"] = [0.1, "left"]  # was 0-100, now 0-255
                else:
                    self.md["Move"] = [0.1, "forward"]

                # Consume the laser points so they are handled only once.
                del self.md["lidar_points"]
                # (disabled: the string literal below is never executed as code)
                '''
                # try to drive to the least known map cell
                target_coords = np.unravel_index(np.argmin(self.map.tiles), self.map.tiles.shape)
                print "Minimal knowledge at tile: "
                tmp = self.map.tile2coordm(target_coords[1], target_coords[0])
                self.md["target"] = Coordinate(tmp[0], tmp[1], 0)
                self.map.wcs2rcs(self.md["target"])
                # first, rotate until we are facing it
                # transfer map tile (WCS) to RCS to calculate relative angle
                '''

        #time.sleep(0.1)
        '''
Exemple #9
0
 def init(self):
     """Build a ``MAP`` from ``self.md`` and keep it on ``self.map``."""
     store = self.md
     self.map = MAP(store)
Exemple #10
0
def main(options):
    # Check whether to read from a gro/pdb file or from stdin
    # We use an iterator to wrap around the stream to allow
    # inferring the file type, without consuming lines already
    inStream = IO.streamTag(options["-f"] and options["-f"].value or sys.stdin)

    # The streamTag iterator first yields the file type, which
    # is used to specify the function for reading frames
    fileType = inStream.next()
    if fileType == "GRO":
        frameIterator = IO.groFrameIterator
    else:
        frameIterator = IO.pdbFrameIterator

    # ITERATE OVER FRAMES IN STRUCTURE FILE #

    # Now iterate over the frames in the stream
    # This should become a StructureFile class with a nice .next method
    model = 1
    cgOutPDB = None
    ssTotal = []
    cysteines = []
    for title, atoms, box in frameIterator(inStream):

        if fileType == "PDB":
            # The PDB file can have chains, in which case we list and process them specifically
            # TER statements are also interpreted as chain separators
            # A chain may have breaks in which case the breaking residues are flagged
            chains = [
                IO.Chain(options, [i for i in IO.residues(chain)])
                for chain in IO.pdbChains(atoms)
            ]
        else:
            # The GRO file does not define chains. Here breaks in the backbone are
            # interpreted as chain separators.
            residuelist = [residue for residue in IO.residues(atoms)]
            # The breaks are indices to residues
            broken = IO.breaks(residuelist)
            # Reorder, such that each chain is specified with (i,j,k)
            # where i and j are the start and end of the chain, and
            # k is a chain identifier
            chains = zip([0] + broken, broken + [len(residuelist)],
                         range(len(broken) + 1))
            chains = [
                IO.Chain(options, residuelist[i:j], name=chr(65 + k))
                for i, j, k in chains
            ]

        for chain in chains:
            chain.multiscale = "all" in options[
                'multi'] or chain.id in options['multi']

        # Check the chain identifiers
        if model == 1 and len(chains) != len(set([i.id for i in chains])):
            # Ending down here means that non-consecutive blocks of atoms in the
            # PDB file have the same chain ID. The warning pertains to PDB files only,
            # since chains from GRO files get a unique chain identifier assigned.
            logging.warning(
                "Several chains have identical chain identifiers in the PDB file."
            )

        # Check if chains are of mixed type. If so, split them.
        # Note that in some cases HETATM residues are part of a
        # chain. This will get problematic. But we cannot cover
        # all, probably.
        if not options['MixedChains']:
            demixedChains = []
            for chain in chains:
                demixedChains.extend(chain.split())
            chains = demixedChains

        n = 1
        logging.info("Found %d chains:" % len(chains))
        for chain in chains:
            logging.info("  %2d:   %s (%s), %d atoms in %d residues." %
                         (n, chain.id, chain._type, chain.natoms, len(chain)))
            n += 1

        # Check all chains
        keep = []
        for chain in chains:
            if chain.type() == "Water":
                logging.info("Removing %d water molecules (chain %s)." %
                             (len(chain), chain.id))
            elif chain.type() in ("Protein", "Nucleic"):
                keep.append(chain)
            # This is currently not active:
            elif options['RetainHETATM']:
                keep.append(chain)
            else:
                logging.info(
                    "Removing HETATM chain %s consisting of %d residues." %
                    (chain.id, len(chain)))
        chains = keep

        # Here we interactively check the charge state of resides
        # Can be easily expanded to residues other than HIS
        for chain in chains:
            for i, resname in enumerate(chain.sequence):
                if resname == 'HIS' and options['chHIS']:
                    choices = {0: 'HIH', 1: 'HIS'}
                    choice = IO.getChargeType(resname, i, choices)
                    chain.sequence[i] = choice

        # Check which chains need merging
        if model == 1:
            order, merge = IO.check_merge(
                chains, options['mergeList'], options['linkList'],
                options['CystineCheckBonds'] and options['CystineMaxDist2'])

        # Get the total length of the sequence
        seqlength = sum([len(chain) for chain in chains])
        logging.info('Total size of the system: %s residues.' % seqlength)

        ## SECONDARY STRUCTURE
        ss = ''
        if options['Collagen']:
            for chain in chains:
                chain.set_ss("F")
                ss += chain.ss
        elif options["-ss"]:
            # XXX We need error-catching here,
            # in case the file doesn't excist, or the string contains bogus.
            # If the string given for the sequence consists strictly of upper case letters
            # and does not appear to be a file, assume it is the secondary structure
            ss = options["-ss"].value.replace('~', 'L').replace(' ', 'L')
            if ss.isalnum() and ss.isupper() and not os.path.exists(
                    options["-ss"].value):
                ss = options["-ss"].value
                logging.info('Secondary structure read from command-line:\n' +
                             ss)
            else:
                # There ought to be a file with the name specified
                ssfile = [i.strip() for i in open(options["-ss"].value)]

                # Try to read the file as a Gromacs Secondary Structure Dump
                # Those have an integer as first line
                if ssfile[0].isdigit():
                    logging.info(
                        'Will read secondary structure from file (assuming Gromacs ssdump).'
                    )
                    ss = "".join([i for i in ssfile[1:]])
                else:
                    # Get the secondary structure type from DSSP output
                    logging.info(
                        'Will read secondary structure from file (assuming DSSP output).'
                    )
                    pss = re.compile(r"^([ 0-9]{4}[0-9]){2}")
                    ss = "".join([
                        i[16] for i in open(options["-ss"].value)
                        if re.match(pss, i)
                    ])

            # Now set the secondary structure for each of the chains
            sstmp = ss
            for chain in chains:
                ln = min(len(sstmp), len(chain))
                chain.set_ss(sstmp[:ln])
                sstmp = ss[:ln]
        else:
            if options["-dssp"]:
                method, executable = "dssp", options["-dssp"].value
            #elif options["-pymol"]:
            #    method, executable = "pymol", options["-pymol"].value
            else:
                logging.warning(
                    "No secondary structure or determination method speficied. Protein chains will be set to 'COIL'."
                )
                method, executable = None, None

            for chain in chains:
                ss += chain.dss(method, executable)

            # Used to be: if method in ("dssp","pymol"): but pymol is not supported
            if method in ["dssp"]:
                logging.debug('%s determined secondary structure:\n' %
                              method.upper() + ss)

        # Collect the secondary structure classifications for different frames
        ssTotal.append(ss)

        # Write the coarse grained structure if requested
        if options["-x"].value:
            logging.info("Writing coarse grained structure.")
            if cgOutPDB is None:
                cgOutPDB = open(options["-x"].value, "w")
            cgOutPDB.write("MODEL %8d\n" % model)
            cgOutPDB.write(title)
            cgOutPDB.write(IO.pdbBoxString(box))
            atid = 1
            for i in order:
                ci = chains[i]
                if ci.multiscale:
                    for r in ci.residues:
                        for name, resn, resi, chain, x, y, z in r:
                            cgOutPDB.write(
                                IO.pdbOut(
                                    (name, resn[:3], resi, chain, x, y, z),
                                    i=atid))
                            atid += 1
                coarseGrained = ci.cg(com=True)
                if coarseGrained:
                    for name, resn, resi, chain, x, y, z, ssid in coarseGrained:
                        if ci.multiscale:
                            name = "v" + name
                        cgOutPDB.write(
                            IO.pdbOut((name, resn[:3], resi, chain, x, y, z),
                                      i=atid,
                                      ssid=ssid))
                        atid += 1
                    cgOutPDB.write("TER\n")
                else:
                    logging.warning(
                        "No mapping for coarse graining chain %s (%s); chain is skipped."
                        % (ci.id, ci.type()))
            cgOutPDB.write("ENDMDL\n")

        # Gather cysteine sulphur coordinates
        cyslist = [cys["SG"] for chain in chains for cys in chain["CYS"]]
        cysteines.append([cys for cys in cyslist if cys])

        model += 1

    # Write the index file if requested.
    # Mainly of interest for multiscaling.
    # Could be improved by adding separte groups for BB, SC, etc.
    if options["-n"].value:
        logging.info("Writing index file.")
        # Lists for All-atom, Virtual sites and Coarse Grain.
        NAA, NVZ, NCG = [], [], []
        atid = 1
        for i in order:
            ci = chains[i]
            coarseGrained = ci.cg(force=True)
            if ci.multiscale:
                NAA.extend([" %5d" % (a + atid) for a in range(ci.natoms)])
                atid += ci.natoms
            if coarseGrained:
                if ci.multiscale:
                    NVZ.extend([
                        " %5d" % (a + atid) for a in range(len(coarseGrained))
                    ])
                else:
                    NCG.extend([
                        " %5d" % (a + atid) for a in range(len(coarseGrained))
                    ])
                atid += len(coarseGrained)
        outNDX = open(options["-n"].value, "w")
        outNDX.write("\n[ AA ]\n" + "\n".join(
            [" ".join(NAA[i:i + 15]) for i in range(0, len(NAA), 15)]))
        outNDX.write("\n[ VZ ]\n" + "\n".join(
            [" ".join(NVZ[i:i + 15]) for i in range(0, len(NVZ), 15)]))
        outNDX.write("\n[ CG ]\n" + "\n".join(
            [" ".join(NCG[i:i + 15]) for i in range(0, len(NCG), 15)]))
        outNDX.close()

    # Write the index file for mapping AA trajectory if requested
    if options["-nmap"].value:
        logging.info("Writing trajectory index file.")
        atid = 1
        outNDX = open(options["-nmap"].value, "w")
        # Get all AA atoms as lists of atoms in residues
        # First we skip hetatoms and unknowns then iterate over beads
        # In DNA the O3' atom is mapped together with atoms from the next residue
        # This stores it until we get to the next residue
        o3_shift = ''
        for i_count, i in enumerate(IO.residues(atoms)):
            if i[0][1] in ("SOL", "HOH", "TIP"):
                continue
            if not i[0][1] in MAP.CoarseGrained.mapping.keys():
                continue
            nra = 0
            names = [j[0] for j in i]
            # This gives out a list of atoms in residue, each tuple has other
            # stuff in it that's needed elsewhere so we just take the last
            # element which is the atom index (in that residue)
            for j_count, j in enumerate(MAP.mapIndex(i)):
                outNDX.write('[ Bead %i of residue %i ]\n' %
                             (j_count + 1, i_count + 1))
                line = ''
                for k in j:
                    if names[k[2]] == "O3'":
                        line += '%s ' % (str(o3_shift))
                        o3_shift = k[2] + atid
                    else:
                        line += '%i ' % (k[2] + atid)
                line += '\n'
                nra += len(j)
                outNDX.write(line)
            atid += nra

    # Evertything below here we only need, if we need to write a Topology
    if options['-o']:

        # Collect the secondary structure stuff and decide what to do with it
        # First rearrange by the residue
        ssTotal = zip(*ssTotal)
        ssAver = []
        for i in ssTotal:
            si = list(set(i))
            if len(si) == 1:
                # Only one type -- consensus
                ssAver.append(si[0])
            else:
                # Transitions between secondary structure types
                i = list(i)
                si = [(1.0 * i.count(j) / len(i), j) for j in si]
                si.sort()
                if si[-1][0] > options["-ssc"].value:
                    ssAver.append(si[-1][1])
                else:
                    ssAver.append(" ")

        ssAver = "".join(ssAver)
        logging.info(
            '(Average) Secondary structure has been determined (see head of .itp-file).'
        )

        # Divide the secondary structure according to the division in chains
        # This will set the secondary structure types to be used for the
        # topology.
        for chain in chains:
            chain.set_ss(ssAver[:len(chain)])
            ssAver = ssAver[len(chain):]

        # Now the chains are complete, each consisting of a residuelist,
        # and a secondary structure designation if the chain is of type 'Protein'.
        # There may be mixed chains, there may be HETATM things.
        # Water has been discarded. Maybe this has to be changed at some point.
        # The order in the coarse grained files matches the order in the set of chains.
        #
        # If there are no merges to be done, i.e. no global Elnedyn network, no
        # disulphide bridges, no links, no distance restraints and no explicit merges,
        # then we can write out the topology, which will match the coarse grained file.
        #
        # If there are merges to be done, the order of things may be changed, in which
        # case the coarse grained structure will not match with the topology...

        # CYSTINE BRIDGES #
        # Extract the cysteine coordinates (for all frames) and the cysteine identifiers
        if options['CystineCheckBonds']:
            logging.info(
                "Checking for cystine bridges, based on sulphur (SG) atoms lying closer than %.4f nm"
                % math.sqrt(options['CystineMaxDist2'] / 100))

            cyscoord = zip(*[[j[4:7] for j in i] for i in cysteines])
            cysteines = [i[:4] for i in cysteines[0]]

            bl, kb = options['ForceField'].special[(("SC1", "CYS"), ("SC1",
                                                                     "CYS"))]

            # Check the distances and add the cysteines to the link list if the
            # SG atoms have a distance smaller than the cutoff.
            rlc = range(len(cysteines))
            for i in rlc[:-1]:
                for j in rlc[i + 1:]:
                    # Checking the minimum distance over all frames
                    # But we could also take the maximum, or the mean
                    d2 = min([
                        FUNC.distance2(a, b)
                        for a, b in zip(cyscoord[i], cyscoord[j])
                    ])
                    if d2 <= options['CystineMaxDist2']:
                        a, b = cysteines[i], cysteines[j]
                        options['linkListCG'].append(
                            (("SC1", "CYS", a[2], a[3]), ("SC1", "CYS", b[2],
                                                          b[3]), bl, kb))
                        a, b = (a[0], a[1], a[2] - (32 << 20),
                                a[3]), (b[0], b[1], b[2] - (32 << 20), b[3])
                        logging.info(
                            "Detected SS bridge between %s and %s (%f nm)" %
                            (a, b, math.sqrt(d2) / 10))

        # REAL ITP STUFF #
        # Check whether we have identical chains, in which case we
        # only write the ITP for one...
        # This means making a distinction between chains and
        # moleculetypes.

        molecules = [tuple([chains[i] for i in j]) for j in merge]

        # At this point we should have a list or dictionary of chains
        # Each chain should be given a unique name, based on the value
        # of options["-o"] combined with the chain identifier and possibly
        # a number if there are chains with identical identifiers.
        # For each chain we then write an ITP file using the name for
        # moleculetype and name + ".itp" for the topology include file.
        # In addition we write a master topology file, using the value of
        # options["-o"], with an added extension ".top" if not given.

        # XXX *NOTE*: This should probably be gathered in a 'Universe' class
        itp = 0
        moleculeTypes = {}
        for mi in range(len(molecules)):
            mol = molecules[mi]
            # Check if the moleculetype is already listed
            # If not, generate the topology from the chain definition
            if mol not in moleculeTypes or options['SeparateTop']:
                # Name of the moleculetype
                # XXX: The naming should be changed; now it becomes Protein_X+Protein_Y+...
                name = "+".join(
                    [chain.getname(options['-name'].value) for chain in mol])
                moleculeTypes[mol] = name

                # Write the molecule type topology
                top = TOP.Topology(mol[0], options=options, name=name)
                for m in mol[1:]:
                    top += TOP.Topology(m, options=options)

                # Have to add the connections, like the connecting network
                # Gather coordinates
                mcg, coords = zip(*[(j[:4], j[4:7]) for m in mol
                                    for j in m.cg(force=True)])
                mcg = list(mcg)

                # Run through the link list and add connections (links = cys bridges or hand specified links)
                for atomA, atomB, bondlength, forceconst in options[
                        'linkListCG']:
                    if bondlength == -1 and forceconst == -1:
                        bondlength, forceconst = options['ForceField'].special[
                            (atomA[:2], atomB[:2])]
                    # Check whether this link applies to this group
                    atomA = atomA in mcg and mcg.index(atomA) + 1
                    atomB = atomB in mcg and mcg.index(atomB) + 1
                    if atomA and atomB:
                        cat = (forceconst is None) and "Constraint" or "Link"
                        top.bonds.append(
                            TOP.Bond((atomA, atomB),
                                     options=options,
                                     type=1,
                                     parameters=(bondlength, forceconst),
                                     category=cat,
                                     comments="Cys-bonds/special link"))

                # Elastic Network
                # The elastic network is added after the topology is constructed, since that
                # is where the correct atom list with numbering and the full set of
                # coordinates for the merged chains are available.
                if options['ElasticNetwork']:
                    rubberType = options['ForceField'].EBondType
                    rubberList = ELN.rubberBands(
                        [(i[0], j) for i, j in zip(top.atoms, coords)
                         if i[4] in options['ElasticBeads']],
                        options['ElasticLowerBound'],
                        options['ElasticUpperBound'],
                        options['ElasticDecayFactor'],
                        options['ElasticDecayPower'],
                        options['ElasticMaximumForce'],
                        options['ElasticMinimumForce'])
                    top.bonds.extend([
                        TOP.Bond(i,
                                 options=options,
                                 type=rubberType,
                                 category="Rubber band") for i in rubberList
                    ])

                # Write out the MoleculeType topology
                destination = options["-o"] and open(
                    moleculeTypes[mol] + ".itp", 'w') or sys.stdout
                destination.write(str(top))

                itp += 1

            # Check whether other chains are equal to this one
            # Skip this step if we are to write all chains to separate moleculetypes
            if not options['SeparateTop']:
                for j in range(mi + 1, len(molecules)):
                    if not molecules[j] in moleculeTypes and mol == molecules[
                            j]:
                        # Molecule j is equal to a molecule mi
                        # Set the name of the moleculetype to the one of that molecule
                        moleculeTypes[molecules[j]] = moleculeTypes[mol]

        logging.info('Written %d ITP file%s' % (itp, itp > 1 and "s" or ""))

        # WRITING THE MASTER TOPOLOGY
        # Output stream
        top = options["-o"] and open(options['-o'].value, 'w') or sys.stdout

        # ITP file listing
        itps = '\n'.join([
            '#include "%s.itp"' % molecule
            for molecule in set(moleculeTypes.values())
        ])

        # Molecule listing
        logging.info("Output contains %d molecules:" % len(molecules))
        n = 1
        for molecule in molecules:
            chainInfo = (n, moleculeTypes[molecule], len(molecule) > 1 and "s"
                         or " ", " ".join([i.id for i in molecule]))
            logging.info("  %2d->  %s (chain%s %s)" % chainInfo)
            n += 1
        molecules = '\n'.join(
            ['%s \t 1' % moleculeTypes[molecule] for molecule in molecules])

        # Set a define if we are to use rubber bands
        useRubber = options['ElasticNetwork'] and "#define RUBBER_BANDS" or ""

        # XXX Specify a better, version specific base-itp name.
        # Do not set a define for position restrains here, as people are more used to do it in mdp file?
        top.write('''#include "martini.itp"

%s

%s

[ system ]
; name
Martini system from %s

[ molecules ]
; name        number
%s''' % (useRubber, itps, options["-f"] and options["-f"].value
         or "stdin", molecules))

        logging.info('Written topology files')

    # Maybe there are forcefield specific log messages?
    options['ForceField'].messages()

    # The following lines are always printed (if no errors occur).
    print "\n\tThere you are. One MARTINI. Shaken, not stirred.\n"
    Q = DOC.martiniq.pop(random.randint(0, len(DOC.martiniq) - 1))
    print "\n", Q[1], "\n%80s" % ("--" + Q[0]), "\n"
Exemple #11
0
def main(options):
    # Check whether to read from a gro/pdb file or from stdin
    # We use an iterator to wrap around the stream to allow
    # inferring the file type, without consuming lines already
    inStream = IO.streamTag(options["-f"] and options["-f"].value or sys.stdin)

    # The streamTag iterator first yields the file type, which
    # is used to specify the function for reading frames
    fileType = inStream.next()
    if fileType == "GRO":
        frameIterator = IO.groFrameIterator
    else:
        frameIterator = IO.pdbFrameIterator

    # ITERATE OVER FRAMES IN STRUCTURE FILE #

    # Now iterate over the frames in the stream
    # This should become a StructureFile class with a nice .next method
    model     = 1
    cgOutPDB  = None
    ssTotal   = []
    cysteines = []
    for title, atoms, box in frameIterator(inStream):

        if fileType == "PDB":
            # The PDB file can have chains, in which case we list and process them specifically
            # TER statements are also interpreted as chain separators
            # A chain may have breaks in which case the breaking residues are flagged
            chains = [IO.Chain(options, [i for i in IO.residues(chain)]) for chain in IO.pdbChains(atoms)]
        else:
            # The GRO file does not define chains. Here breaks in the backbone are
            # interpreted as chain separators.
            residuelist = [residue for residue in IO.residues(atoms)]
            # The breaks are indices to residues
            broken = IO.breaks(residuelist)
            # Reorder, such that each chain is specified with (i,j,k)
            # where i and j are the start and end of the chain, and
            # k is a chain identifier
            chains = zip([0]+broken, broken+[len(residuelist)], range(len(broken)+1))
            chains = [IO.Chain(options, residuelist[i:j], name=chr(65+k)) for i, j, k in chains]

        for chain in chains:
            chain.multiscale = "all" in options['multi'] or chain.id in options['multi']

        # Check the chain identifiers
        if model == 1 and len(chains) != len(set([i.id for i in chains])):
            # Ending down here means that non-consecutive blocks of atoms in the
            # PDB file have the same chain ID. The warning pertains to PDB files only,
            # since chains from GRO files get a unique chain identifier assigned.
            logging.warning("Several chains have identical chain identifiers in the PDB file.")

        # Check if chains are of mixed type. If so, split them.
        # Note that in some cases HETATM residues are part of a
        # chain. This will get problematic. But we cannot cover
        # all, probably.
        if not options['MixedChains']:
            demixedChains = []
            for chain in chains:
                demixedChains.extend(chain.split())
            chains = demixedChains

        n = 1
        logging.info("Found %d chains:" % len(chains))
        for chain in chains:
            logging.info("  %2d:   %s (%s), %d atoms in %d residues." % (n, chain.id, chain._type, chain.natoms, len(chain)))
            n += 1

        # Check all chains
        keep = []
        for chain in chains:
            if chain.type() == "Water":
                logging.info("Removing %d water molecules (chain %s)." % (len(chain), chain.id))
            elif chain.type() in ("Protein", "Nucleic"):
                keep.append(chain)
            # This is currently not active:
            elif options['RetainHETATM']:
                keep.append(chain)
            else:
                logging.info("Removing HETATM chain %s consisting of %d residues." % (chain.id, len(chain)))
        chains = keep

        # Here we interactively check the charge state of resides
        # Can be easily expanded to residues other than HIS
        for chain in chains:
            for i, resname in enumerate(chain.sequence):
                if resname == 'HIS' and options['chHIS']:
                    choices = {0: 'HIH', 1: 'HIS'}
                    choice = IO.getChargeType(resname, i, choices)
                    chain.sequence[i] = choice

        # Check which chains need merging
        if model == 1:
            order, merge = IO.check_merge(chains, options['mergeList'], options['linkList'], options['CystineCheckBonds'] and options['CystineMaxDist2'])

        # Get the total length of the sequence
        seqlength = sum([len(chain) for chain in chains])
        logging.info('Total size of the system: %s residues.' % seqlength)

        ## SECONDARY STRUCTURE
        ss = ''
        if options['Collagen']:
            for chain in chains:
                chain.set_ss("F")
                ss += chain.ss
        elif options["-ss"]:
            # XXX We need error-catching here,
            # in case the file doesn't excist, or the string contains bogus.
            # If the string given for the sequence consists strictly of upper case letters
            # and does not appear to be a file, assume it is the secondary structure
            ss = options["-ss"].value.replace('~', 'L').replace(' ', 'L')
            if ss.isalnum() and ss.isupper() and not os.path.exists(options["-ss"].value):
                ss = options["-ss"].value
                logging.info('Secondary structure read from command-line:\n'+ss)
            else:
                # There ought to be a file with the name specified
                ssfile = [i.strip() for i in open(options["-ss"].value)]

                # Try to read the file as a Gromacs Secondary Structure Dump
                # Those have an integer as first line
                if ssfile[0].isdigit():
                    logging.info('Will read secondary structure from file (assuming Gromacs ssdump).')
                    ss = "".join([i for i in ssfile[1:]])
                else:
                    # Get the secondary structure type from DSSP output
                    logging.info('Will read secondary structure from file (assuming DSSP output).')
                    pss = re.compile(r"^([ 0-9]{4}[0-9]){2}")
                    ss  = "".join([i[16] for i in open(options["-ss"].value) if re.match(pss, i)])

            # Now set the secondary structure for each of the chains
            sstmp = ss
            for chain in chains:
                ln = min(len(sstmp), len(chain))
                chain.set_ss(sstmp[:ln])
                sstmp = ss[:ln]
        else:
            if options["-dssp"]:
                method, executable = "dssp", options["-dssp"].value
            #elif options["-pymol"]:
            #    method, executable = "pymol", options["-pymol"].value
            else:
                logging.warning("No secondary structure or determination method speficied. Protein chains will be set to 'COIL'.")
                method, executable = None, None

            for chain in chains:
                ss += chain.dss(method, executable)

            # Used to be: if method in ("dssp","pymol"): but pymol is not supported
            if method in ["dssp"]:
                logging.debug('%s determined secondary structure:\n' % method.upper()+ss)

        # Collect the secondary structure classifications for different frames
        ssTotal.append(ss)

        # Write the coarse grained structure if requested
        if options["-x"].value:
            logging.info("Writing coarse grained structure.")
            if cgOutPDB is None:
                cgOutPDB = open(options["-x"].value, "w")
            cgOutPDB.write("MODEL %8d\n" % model)
            cgOutPDB.write(title)
            cgOutPDB.write(IO.pdbBoxString(box))
            atid = 1
            for i in order:
                ci = chains[i]
                if ci.multiscale:
                    for r in ci.residues:
                        for name, resn, resi, chain, x, y, z in r:
                            cgOutPDB.write(IO.pdbOut((name, resn[:3], resi, chain, x, y, z),i=atid))
                            atid += 1
                coarseGrained = ci.cg(com=True)
                if coarseGrained:
                    for name, resn, resi, chain, x, y, z, ssid in coarseGrained:
                        if ci.multiscale:
                            name = "v"+name
                        cgOutPDB.write(IO.pdbOut((name, resn[:3], resi, chain, x, y, z),i=atid,ssid=ssid))
                        atid += 1
                    cgOutPDB.write("TER\n")
                else:
                    logging.warning("No mapping for coarse graining chain %s (%s); chain is skipped." % (ci.id, ci.type()))
            cgOutPDB.write("ENDMDL\n")

        # Gather cysteine sulphur coordinates
        cyslist = [cys["SG"] for chain in chains for cys in chain["CYS"]]
        cysteines.append([cys for cys in cyslist if cys])

        model += 1

    # Write the index file if requested.
    # Mainly of interest for multiscaling.
    # Could be improved by adding separte groups for BB, SC, etc.
    if options["-n"].value:
        logging.info("Writing index file.")
        # Lists for All-atom, Virtual sites and Coarse Grain.
        NAA, NVZ, NCG = [], [], []
        atid = 1
        for i in order:
            ci = chains[i]
            coarseGrained = ci.cg(force=True)
            if ci.multiscale:
                NAA.extend([" %5d" % (a+atid) for a in range(ci.natoms)])
                atid += ci.natoms
            if coarseGrained:
                if ci.multiscale:
                    NVZ.extend([" %5d" % (a+atid) for a in range(len(coarseGrained))])
                else:
                    NCG.extend([" %5d" % (a+atid) for a in range(len(coarseGrained))])
                atid += len(coarseGrained)
        outNDX = open(options["-n"].value, "w")
        outNDX.write("\n[ AA ]\n"+"\n".join([" ".join(NAA[i:i+15]) for i in range(0, len(NAA), 15)]))
        outNDX.write("\n[ VZ ]\n"+"\n".join([" ".join(NVZ[i:i+15]) for i in range(0, len(NVZ), 15)]))
        outNDX.write("\n[ CG ]\n"+"\n".join([" ".join(NCG[i:i+15]) for i in range(0, len(NCG), 15)]))
        outNDX.close()

    # Write the index file for mapping AA trajectory if requested
    if options["-nmap"].value:
        logging.info("Writing trajectory index file.")
        atid = 1
        outNDX = open(options["-nmap"].value, "w")
        # Get all AA atoms as lists of atoms in residues
        # First we skip hetatoms and unknowns then iterate over beads
        # In DNA the O3' atom is mapped together with atoms from the next residue
        # This stores it until we get to the next residue
        o3_shift = ''
        for i_count, i in enumerate(IO.residues(atoms)):
            if i[0][1] in ("SOL", "HOH", "TIP"):
                continue
            if not i[0][1] in MAP.CoarseGrained.mapping.keys():
                continue
            nra = 0
            names = [j[0] for j in i]
            # This gives out a list of atoms in residue, each tuple has other
            # stuff in it that's needed elsewhere so we just take the last
            # element which is the atom index (in that residue)
            for j_count, j in enumerate(MAP.mapIndex(i)):
                outNDX.write('[ Bead %i of residue %i ]\n' % (j_count+1, i_count+1))
                line = ''
                for k in j:
                    if names[k[2]] == "O3'":
                        line += '%s ' % (str(o3_shift))
                        o3_shift = k[2]+atid
                    else:
                        line += '%i ' % (k[2]+atid)
                line += '\n'
                nra += len(j)
                outNDX.write(line)
            atid += nra

    # Evertything below here we only need, if we need to write a Topology
    if options['-o']:

        # Collect the secondary structure stuff and decide what to do with it
        # First rearrange by the residue
        ssTotal = zip(*ssTotal)
        ssAver  = []
        for i in ssTotal:
            si = list(set(i))
            if len(si) == 1:
                # Only one type -- consensus
                ssAver.append(si[0])
            else:
                # Transitions between secondary structure types
                i = list(i)
                si = [(1.0*i.count(j)/len(i), j) for j in si]
                si.sort()
                if si[-1][0] > options["-ssc"].value:
                    ssAver.append(si[-1][1])
                else:
                    ssAver.append(" ")

        ssAver = "".join(ssAver)
        logging.info('(Average) Secondary structure has been determined (see head of .itp-file).')

        # Divide the secondary structure according to the division in chains
        # This will set the secondary structure types to be used for the
        # topology.
        for chain in chains:
            chain.set_ss(ssAver[:len(chain)])
            ssAver = ssAver[len(chain):]

        # Now the chains are complete, each consisting of a residuelist,
        # and a secondary structure designation if the chain is of type 'Protein'.
        # There may be mixed chains, there may be HETATM things.
        # Water has been discarded. Maybe this has to be changed at some point.
        # The order in the coarse grained files matches the order in the set of chains.
        #
        # If there are no merges to be done, i.e. no global Elnedyn network, no
        # disulphide bridges, no links, no distance restraints and no explicit merges,
        # then we can write out the topology, which will match the coarse grained file.
        #
        # If there are merges to be done, the order of things may be changed, in which
        # case the coarse grained structure will not match with the topology...

        # CYSTINE BRIDGES #
        # Extract the cysteine coordinates (for all frames) and the cysteine identifiers
        if options['CystineCheckBonds']:
            logging.info("Checking for cystine bridges, based on sulphur (SG) atoms lying closer than %.4f nm" % math.sqrt(options['CystineMaxDist2']/100))

            cyscoord  = zip(*[[j[4:7] for j in i] for i in cysteines])
            cysteines = [i[:4] for i in cysteines[0]]

            bl, kb    = options['ForceField'].special[(("SC1", "CYS"), ("SC1", "CYS"))]

            # Check the distances and add the cysteines to the link list if the
            # SG atoms have a distance smaller than the cutoff.
            rlc = range(len(cysteines))
            for i in rlc[:-1]:
                for j in rlc[i+1:]:
                    # Checking the minimum distance over all frames
                    # But we could also take the maximum, or the mean
                    d2 = min([FUNC.distance2(a, b) for a, b in zip(cyscoord[i], cyscoord[j])])
                    if d2 <= options['CystineMaxDist2']:
                        a, b = cysteines[i], cysteines[j]
                        options['linkListCG'].append((("SC1", "CYS", a[2], a[3]), ("SC1", "CYS", b[2], b[3]), bl, kb))
                        a, b = (a[0], a[1], a[2]-(32 << 20), a[3]), (b[0], b[1], b[2]-(32 << 20), b[3])
                        logging.info("Detected SS bridge between %s and %s (%f nm)" % (a, b, math.sqrt(d2)/10))

        # REAL ITP STUFF #
        # Check whether we have identical chains, in which case we
        # only write the ITP for one...
        # This means making a distinction between chains and
        # moleculetypes.

        molecules = [tuple([chains[i] for i in j]) for j in merge]

        # At this point we should have a list or dictionary of chains
        # Each chain should be given a unique name, based on the value
        # of options["-o"] combined with the chain identifier and possibly
        # a number if there are chains with identical identifiers.
        # For each chain we then write an ITP file using the name for
        # moleculetype and name + ".itp" for the topology include file.
        # In addition we write a master topology file, using the value of
        # options["-o"], with an added extension ".top" if not given.

        # XXX *NOTE*: This should probably be gathered in a 'Universe' class
        itp = 0
        moleculeTypes = {}
        for mi in range(len(molecules)):
            mol = molecules[mi]
            # Check if the moleculetype is already listed
            # If not, generate the topology from the chain definition
            if mol not in moleculeTypes or options['SeparateTop']:
                # Name of the moleculetype
                # XXX: The naming should be changed; now it becomes Protein_X+Protein_Y+...
                name = "+".join([chain.getname(options['-name'].value) for chain in mol])
                moleculeTypes[mol] = name

                # Write the molecule type topology
                top = TOP.Topology(mol[0], options=options, name=name)
                for m in mol[1:]:
                    top += TOP.Topology(m, options=options)

                # Have to add the connections, like the connecting network
                # Gather coordinates
                mcg, coords = zip(*[(j[:4], j[4:7]) for m in mol for j in m.cg(force=True)])
                mcg         = list(mcg)

                # Run through the link list and add connections (links = cys bridges or hand specified links)
                for atomA, atomB, bondlength, forceconst in options['linkListCG']:
                    if bondlength == -1 and forceconst == -1:
                        bondlength, forceconst = options['ForceField'].special[(atomA[:2], atomB[:2])]
                    # Check whether this link applies to this group
                    atomA = atomA in mcg and mcg.index(atomA)+1
                    atomB = atomB in mcg and mcg.index(atomB)+1
                    if atomA and atomB:
                        cat = (forceconst is None) and "Constraint" or "Link"
                        top.bonds.append(TOP.Bond(
                            (atomA, atomB),
                            options    = options,
                            type       = 1,
                            parameters = (bondlength, forceconst),
                            category   = cat,
                            comments   = "Cys-bonds/special link"))

                # Elastic Network
                # The elastic network is added after the topology is constructed, since that
                # is where the correct atom list with numbering and the full set of
                # coordinates for the merged chains are available.
                if options['ElasticNetwork']:
                    rubberType = options['ForceField'].EBondType
                    rubberList = ELN.rubberBands(
                        [(i[0], j) for i, j in zip(top.atoms, coords) if i[4] in options['ElasticBeads']],
                        options['ElasticLowerBound'], options['ElasticUpperBound'],
                        options['ElasticDecayFactor'], options['ElasticDecayPower'],
                        options['ElasticMaximumForce'], options['ElasticMinimumForce'])
                    top.bonds.extend([TOP.Bond(i, options=options, type=rubberType, category="Rubber band") for i in rubberList])

                # Write out the MoleculeType topology
                destination = options["-o"] and open(moleculeTypes[mol]+".itp", 'w') or sys.stdout
                destination.write(str(top))

                itp += 1

            # Check whether other chains are equal to this one
            # Skip this step if we are to write all chains to separate moleculetypes
            if not options['SeparateTop']:
                for j in range(mi+1, len(molecules)):
                    if not molecules[j] in moleculeTypes and mol == molecules[j]:
                        # Molecule j is equal to a molecule mi
                        # Set the name of the moleculetype to the one of that molecule
                        moleculeTypes[molecules[j]] = moleculeTypes[mol]

        logging.info('Written %d ITP file%s' % (itp, itp > 1 and "s" or ""))

        # WRITING THE MASTER TOPOLOGY
        # Output stream
        top  = options["-o"] and open(options['-o'].value, 'w') or sys.stdout

        # ITP file listing
        itps = '\n'.join(['#include "%s.itp"' % molecule for molecule in set(moleculeTypes.values())])

        # Molecule listing
        logging.info("Output contains %d molecules:" % len(molecules))
        n = 1
        for molecule in molecules:
            chainInfo = (n, moleculeTypes[molecule], len(molecule) > 1 and "s" or " ", " ".join([i.id for i in molecule]))
            logging.info("  %2d->  %s (chain%s %s)" % chainInfo)
            n += 1
        molecules   = '\n'.join(['%s \t 1' % moleculeTypes[molecule] for molecule in molecules])

        # Set a define if we are to use rubber bands
        useRubber   = options['ElasticNetwork'] and "#define RUBBER_BANDS" or ""

        # XXX Specify a better, version specific base-itp name.
        # Do not set a define for position restrains here, as people are more used to do it in mdp file?
        top.write(
'''#include "martini.itp"

%s

%s

[ system ]
; name
Martini system from %s

[ molecules ]
; name        number
%s''' % (useRubber, itps, options["-f"] and options["-f"].value or "stdin", molecules))

        logging.info('Written topology files')

    # Maybe there are forcefield specific log messages?
    options['ForceField'].messages()

    # The following lines are always printed (if no errors occur).
    print "\n\tThere you are. One MARTINI. Shaken, not stirred.\n"
    Q = DOC.martiniq.pop(random.randint(0, len(DOC.martiniq)-1))
    print "\n", Q[1], "\n%80s" % ("--"+Q[0]), "\n"
# Hyper-parameter grid handed to the ALS tuning routine.
reg_params = [0.01, 0.05, 0.1, 0.2, 0.5]
ranks = [10, 20]

best_model_rmse, best_model_map = tuning.tune_ALS_NLP(
    spark, train, val, val_true_list,
    num_iters, reg_params, ranks, review_val_predictions,
)


# ---- test performance ----

# Predictions of the review-feature model (best_model_rmse_lr is defined
# earlier in the script); negative predictions are replaced by 0 and the
# column is renamed so it can live next to the ALS prediction.
test_predictions = best_model_rmse_lr.transform(test_review_feature)
review_test_predictions = (
    test_predictions
    .withColumn(
        'prediction',
        when(test_predictions['prediction'] < 0, 0).otherwise(test_predictions['prediction']),
    )
    .withColumnRenamed('prediction', 'review_prediction')
)

# Predictions of the ALS model that achieved the lowest RMSE during tuning.
test_predictions = best_model_rmse.transform(test)
als_test_predictions = test_predictions.withColumnRenamed('prediction', 'als_prediction')

# Outer-join both prediction sets; the review prediction wins whenever it
# is present, otherwise the ALS prediction is used.
total_predictions = als_test_predictions.join(
    review_test_predictions,
    ['user_id', 'book_id', 'rating'],
    'outer',
)
total_predictions = total_predictions.withColumn(
    'total_prediction',
    when(
        total_predictions['review_prediction'].isNotNull(),
        total_predictions['review_prediction'],
    ).otherwise(total_predictions['als_prediction']),
)

# Keep, per user, the rows ranked 500 or better by combined prediction.
window = (
    Window
    .partitionBy(total_predictions['user_id'])
    .orderBy(total_predictions['total_prediction'].desc())
)
top_predictions = (
    total_predictions
    .select('*', rank().over(window).alias('row_num'))
    .filter(col('row_num') <= 500)
)

# RMSE of the combined prediction against the true rating.
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='rating',
    predictionCol='total_prediction',
)
rmse_test = evaluator.evaluate(top_predictions)

# Rank the true ratings per user and score the predicted ranking with MAP.
window = (
    Window
    .partitionBy(test['user_id'])
    .orderBy(test['rating'].desc())
)
test_true_list = test.select('*', rank().over(window).alias('true_row'))
map_score = MAP.getMAP(top_predictions, test_true_list)
print('Test set RMSE = {}, Test set MAP = {}'.format(rmse_test, map_score))




Exemple #13
0
def tune_ALS_NLP(spark, train_data, validation_data, val_true_list, maxIter, regParams, ranks, review_val_predictions,
                 top_k=500, eval_log_path='train05_review_eval.csv'):
    """Grid-search ALS hyper-parameters, scoring ALS combined with review predictions.

    For every combination of ``maxIter`` x ``ranks`` x ``regParams`` an ALS
    model is fitted on ``train_data`` and applied to ``validation_data``.
    Its predictions are outer-joined with ``review_val_predictions`` on
    (user_id, book_id, rating); the review prediction is used where it is
    present and the ALS prediction otherwise.  Per user, the rows ranked
    ``top_k`` or better by that combined prediction are evaluated with RMSE
    and with ``MAP.getMAP`` against ``val_true_list``.  One CSV row
    (iteration, rank, reg, rmse, map) per combination is appended to
    ``eval_log_path``.

    Args:
        spark: Not used by this function; kept for interface compatibility.
        train_data: DataFrame the ALS models are fitted on.
        validation_data: DataFrame the fitted models are applied to.
        val_true_list: Ground truth passed on to ``MAP.getMAP``.
        maxIter: Iterable of ALS ``maxIter`` values to try (not a single int).
        regParams: Iterable of ALS ``regParam`` values to try.
        ranks: Iterable of ALS ``rank`` values to try.
        review_val_predictions: DataFrame with a ``prediction`` column that
            is renamed to ``review_prediction`` before joining.
        top_k: Per-user rank cut-off applied before evaluation.
        eval_log_path: File the per-combination results are appended to.

    Returns:
        Tuple ``(best_model_rmse, best_model_map)``: the model with the
        lowest RMSE and the model with the highest MAP.  Either is ``None``
        if no combination improved on the initial value (``inf`` for RMSE,
        ``0.0`` for MAP), e.g. when one of the grids is empty.
    """
    # Best-by-RMSE bookkeeping
    min_error = float('inf')
    best_iter1 = -1
    best_rank1 = -1
    best_regularization1 = 0
    best_model_rmse = None
    # Best-by-MAP bookkeeping
    max_map = 0.0
    best_iter2 = -1
    best_rank2 = -1
    best_regularization2 = 0
    best_model_map = None

    # The review predictions do not depend on the ALS hyper-parameters,
    # so the rename is done once instead of once per combination.
    review_predictions = review_val_predictions.withColumnRenamed('prediction', 'review_prediction')

    for iteration in maxIter:
        for current_rank in ranks:
            for reg in regParams:
                als = ALS(maxIter=iteration, regParam=reg, rank=current_rank,
                          userCol='user_id', itemCol='book_id', ratingCol='rating',
                          coldStartStrategy="drop", nonnegative=True)
                als_model = als.fit(train_data)
                predictions = als_model.transform(validation_data)

                # Combine both sources: review prediction first, ALS as fallback.
                als_predictions = predictions.withColumnRenamed('prediction', 'als_prediction')
                total_predictions = als_predictions.join(review_predictions, ['user_id', 'book_id', 'rating'], 'outer')
                total_predictions = total_predictions.withColumn(
                    'total_prediction',
                    when(total_predictions['review_prediction'].isNotNull(),
                         total_predictions['review_prediction'])
                    .otherwise(total_predictions['als_prediction']))

                # Per user, keep the rows ranked top_k or better.
                window = Window.partitionBy(total_predictions['user_id']).orderBy(total_predictions['total_prediction'].desc())
                top_predictions = total_predictions.select('*', rank().over(window).alias('row_num')).filter(col('row_num') <= top_k)

                # rmse
                evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='total_prediction')
                rmse = evaluator.evaluate(top_predictions)
                if rmse < min_error:
                    min_error = rmse
                    best_rank1 = current_rank
                    best_regularization1 = reg
                    best_iter1 = iteration
                    best_model_rmse = als_model

                # MAP
                current_map = MAP.getMAP(top_predictions, val_true_list)
                if current_map > max_map:
                    max_map = current_map
                    best_rank2 = current_rank
                    best_regularization2 = reg
                    best_iter2 = iteration
                    best_model_map = als_model

                # The two metrics are separated by ", " so the values do not run together.
                print('{} latent factors and regularization = {} with maxIter {}: '
                      'validation RMSE is {}, validation MAP is {}'.format(current_rank, reg, iteration, rmse, current_map))

                # Append mode, so the results of earlier runs are kept.
                with open(eval_log_path, 'ab') as f:
                    np.savetxt(f, [np.array([iteration, current_rank, reg, rmse, current_map])], delimiter=",")

    print('\nThe best model select by RMSE has {} latent factors and '
          'regularization = {} with maxIter = {}: RMSE = {}'.format(best_rank1, best_regularization1, best_iter1, min_error))
    print('\nThe best model select by MAP has {} latent factors and '
          'regularization = {} with maxIter = {}: MAP = {}'.format(best_rank2, best_regularization2, best_iter2, max_map))

    return best_model_rmse, best_model_map