def bestFirstSearch(self, startNode, goalNode, heuristic):
    # Greedy best-first search: the frontier is ordered by heuristic value alone.
    # Requires: from queue import PriorityQueue
    bestFirstQueue = PriorityQueue()
    visitedNode = {}
    utility = Utility()
    goalNodeStateDict = utility.getGoalNodeStateDict(goalNode)
    bestFirstQueue.put((1, startNode))
    while not bestFirstQueue.empty():
        poppedTuple = bestFirstQueue.get()
        self.nodesVisited += 1
        popped = poppedTuple[1]
        if popped == goalNode:
            return self.backtrack(popped)
        else:
            visitedNode[popped.stringRep] = popped
            if self.maxDepth < popped.level:
                self.maxDepth = popped.level
            popped.expand()
            for child in popped.children:
                if child.stringRep not in visitedNode:
                    heuristicOfChild = utility.getHeuristic(child, goalNode, heuristic, goalNodeStateDict)
                    bestFirstQueue.put((heuristicOfChild, child))
                    self.maxQueueSize += 1
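One thing to watch in the loop above: PriorityQueue orders entries by comparing whole tuples, so two children with the same heuristic fall back to comparing the node objects themselves, which raises TypeError in Python 3 unless the node class defines ordering. A minimal sketch of the usual workaround, assuming nothing about the node class, adds an insertion counter as a tie-breaker:

from itertools import count
from queue import PriorityQueue

tie_breaker = count()            # strictly increasing sequence number

def push(queue, priority, node):
    # (priority, seq, node): seq settles ties, so node objects are never compared
    queue.put((priority, next(tie_breaker), node))

def pop(queue):
    priority, _seq, node = queue.get()
    return priority, node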
Example #2
def proof_of_work(a, last_block, data):
    func = inspect.currentframe().f_back.f_code
    logging.info("Starting proof of work")
    start = time.time()
    interval = 20
    now = time.time() + 1
    effort, pow_hash_object = Utility.genhash(last_block.index + 1, time.time(), data, last_block.hash)
    leading_zeroes = Utility.leadingzeroes(pow_hash_object.digest())
    while leading_zeroes <= variables.WORK:
        now = time.time() + 1
        if int(now - start) % interval == 0:
            logging.debug("Checking for messages")
            messages = []
            while not a.empty():
                obj = a.get()
                logging.debug("Got {} from queue".format(obj))
                messages.append(obj)
            for message in messages:
                if message[0] == "ip":
                    logging.debug("That's an ip {} adding to peers".format(message[1]))
                    variables.PEER_NODES.append(str(message[1]))
                    continue
                logging.debug("not an IP, putting it back message:{}".format(message))
                a.put(message)
            start = time.time()
            consensus_chain = consensus()

            if consensus_chain:
                logging.info("Received a consensus while doing POW")
                return False, consensus_chain
        effort, pow_hash_object = Utility.genhash(last_block.index + 1, now, data, last_block.hash)
        leading_zeroes = Utility.leadingzeroes(pow_hash_object.digest())
    retBlock = Block(last_block.index + 1, now, pow_hash_object.hexdigest(), effort, data, last_block.hash)
    logging.info("Farmed a block returning: {}".format(retBlock))
    return True, retBlock
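Utility.genhash and Utility.leadingzeroes are not reproduced on this page. For orientation only, a leadingzeroes helper of the kind the loop relies on could count the leading zero bits of the digest; the sketch below is an assumption about that interface, not the project's code:

def leadingzeroes(digest):
    # Hypothetical helper: number of leading zero bits in a bytes digest.
    bits = 0
    for byte in digest:
        if byte == 0:
            bits += 8
            continue
        bits += 8 - byte.bit_length()   # zeros at the top of the first non-zero byte
        break
    return bits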
Example #3
def start():
    global node, blockchain
    genesis = Utility.create_genesis_block()
    blockchain = Blockchain(genesis.index, genesis.timemade,
                            genesis.proof_of_work, genesis.effort,
                            genesis.transactions, genesis.previous_hash)
    node.config['SECRET_KEY'] = Utility.createHexdigest(User.password)
    node.run(host="0.0.0.0", port=variables.PORT)
Example #4
    def write_value(cls, io, value, master, device, step=1):
        try:
            address = io.address
            byte_order = device.byte_order
            if 'int' in io.type or 'dword' in io.type or 'float' in io.type:
                (cmd, mb_addr) = cls().__transfer_write_addr(
                    int(address) + (step * 2), io.type)
            else:
                (cmd, mb_addr) = cls().__transfer_write_addr(
                    int(address) + step, io.type)

            if 'int' in io.type:
                value1 = int(value)
                values = Utility.toWords(value1, byte_order)
            elif 'dword' in io.type:
                value1 = int(value)
                if ("unsigned" in io.type and 0 <= value1 <= 4294967295) or \
                        ("unsigned" not in io.type and -2147483648 < value1 < 2147483648):
                    values = Utility.toWords(value1, byte_order)
                else:
                    raise ValueError("Invalid values: %s" % value1)
            elif io.type == 'float':
                value1 = float(value)
                values = Utility.Float2ieee754Words(value1, byte_order)
            elif 'word' in io.type:
                value1 = int(value)
                if ("unsigned" in io.type and 0 <= value1 <= 65535) or \
                        ("unsigned" not in io.type and -32768 <= value1 <= 32767):
                    if re.search(r'(^ba($|dc$))|(^dcba$)',
                                 byte_order) is not None:
                        h = ((value1 & 0xff00) >> 8)
                        l = (value1 & 0xff)
                        value1 = ((l << 8) & 0xff00) | h
                    if cmd == cst.WRITE_SINGLE_COIL:
                        values = value1
                    else:
                        values = [
                            value1,
                        ]
                else:
                    raise ValueError("Invalid values: %s" % value1)
            else:
                if cmd == cst.WRITE_SINGLE_COIL:
                    values = value
                else:
                    values = [
                        value,
                    ]
            logger.debug(
                "write ctrl [modbus] id: %s, cmd: %s, mb_addr: %s, value: %s" %
                (device.id, cmd, mb_addr, values))
            master.execute(device.machine_address,
                           cmd,
                           mb_addr,
                           output_value=values)
        except Exception as e:
            logger.debug("Write error [modbus] @ %s : %s" % (io.mb_id, e))
Example #5
def start(e):
    global node, blockchain, mining_process, event
    if not len(Variables.PEER_NODES) > 0:
        genesis = Utility.create_genesis_block()
        blockchain = Blockchain(genesis)
    else:
        consensus()

    event = e
    mining_process = Process(target=Mining.mine, args=(event,))
    mining_process.start()
    logging.debug("Mining_classes Started")
    node.config['SECRET_KEY'] = Utility.createHexdigest(User.password)
    node.run(host="0.0.0.0", port=Variables.PORT)
Example #6
def getFeatureRMSEAgainstBaseline(cols=['color_exist']):
    # Work on a copy so the shared default list is not mutated by the appends below.
    cols = list(cols)
    utility = Utility()
    utility.startTimeTrack()
    # This part skips the feature training and simply uses it.
    print("len(cols):", len(cols), cols)
    print("Reading feature set")
    all_df = pd.read_csv('../data/features_doc2vec_sense2vec_pmi_20170418.csv')
    feature_train_df = all_df[:74067].copy()  # copy so the in-place drop below works on its own frame
    # Must drop these columns for OrdinalRegression
    feature_train_df.drop('wm_product_brand', axis=1, inplace=True)

    cols.append('relevance_int')
    cols.append('id')
    cols.append('search_term')
    cols.append('product_uid')
    cols.append('relevance')
    cols.append('product_idx')
    cols.append('Word2VecQueryExpansion')

    print(cols)
    feature_train_df = feature_train_df.filter(items=cols, axis=1)

    feature_test_df = all_df[74067:].copy()
    feature_test_df.drop('relevance', axis=1, inplace=True)
    utility.checkpointTimeTrack()

    print("####  Running: OrdinalRegression ordridge training ####")
    # dp=DataPreprocessing()
    print("feature_train_df:", list(feature_train_df))
    # trainDF,validateDF=dp.generateValidationSet(train_df)
    orModel = OrdinalRegressionRanker('ordridge')
    orModel.train(feature_train_df, None)
    # orModel.gridSearch(feature_train_df, None)
    print("####  Completed: OrdinalRegression ordridge training ####")
    utility.checkpointTimeTrack()
Example #7
def mine(a):
    func = inspect.currentframe().f_back.f_code
    logging.info("Starting to mine")
    # See if other blockchains exist
    blockchain = consensus()
    if not blockchain:
        logging.info("Didn't receive a blockchain from anyone and need to make one")
        # We didn't find one, need to make one
        variables.BLOCKCHAIN.append(Utility.create_genesis_block())
    else:
        logging.info("Received a blockchain from the net")
        # See if we got any blocks from someone, save it
        variables.BLOCKCHAIN = blockchain
    message = Utility.buildmessage("blockchain", variables.BLOCKCHAIN)
    logging.debug("Adding {} to queue".format(message))
    a.put(message)
    url = "http://" + variables.MINER_NODE_URL + ":" + str(variables.PORT) + "/blocks?update=" + User.public_key
    logging.debug("accessing url via GET")
    requests.get(url)
    logging.debug("Done accessing url")
    while True:
        last_block = variables.BLOCKCHAIN[len(variables.BLOCKCHAIN) - 1]
        #   -get transactions
        url = "http://" + variables.MINER_NODE_URL + ":" + str(variables.PORT) + "/txion?update=" + User.public_key
        logging.debug("Getting transactions from {}".format(url))
        transactions = requests.get(url).content
        logging.debug("Done getting transactions")
        variables.PENDING_TRANSACTIONS = json.loads(transactions)
        logging.warning("type of transaction: {}".format(type(variables.PENDING_TRANSACTIONS)))
        variables.PENDING_TRANSACTIONS.append({
            "from": "network",
            "to": User.public_key,
            "amount": 1.0})
        # mine using the updated blockchain
        pow, pow_output = proof_of_work(a, last_block, variables.PENDING_TRANSACTIONS)
        variables.PENDING_TRANSACTIONS = []
        if pow:
            logging.info("Mined a block {}".format(pow_output))
            variables.BLOCKCHAIN.append(pow_output)
        else:
            logging.info("Consensus returned a blockchain {}".format(pow_output))
            variables.BLOCKCHAIN = pow_output
        logging.debug("Adding that blockchain to the Queue")
        a.put(["mine", variables.BLOCKCHAIN])
        url = "http://" + variables.MINER_NODE_URL + ":" + str(variables.PORT) + "/blocks?update=" + User.public_key
        logging.debug("accessing url via GET")
        requests.get(url)
        logging.debug("Done accessing url")
Example #8
def proof_of_work(last_block, data):
    index_to_use = last_block.index + 1
    func = inspect.currentframe().f_back.f_code
    logging.info("Starting proof of work")
    done = False
    while not done:
        now = time.time()
        effort, pow_hash_object = Utility.genhash(index_to_use, now, data,
                                                  last_block.hash)
        leading_zeroes = Utility.leadingzeroes(pow_hash_object.digest())
        if leading_zeroes >= variables.WORK:
            done = True
    retBlock = Block(index_to_use, now, pow_hash_object.hexdigest(), effort,
                     data, last_block.hash)
    logging.info("Farmed a block returning: {}".format(retBlock))
    return retBlock
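Utility.genhash(index, timestamp, data, previous_hash) evidently returns an (effort, hash_object) pair, where effort plays the role of a nonce. The helper itself is not listed here; the following SHA-256 sketch is only a guess at that shape, not the actual implementation:

import hashlib
import random

def genhash(index, timestamp, data, previous_hash):
    # Hypothetical: hash the block fields together with a random nonce ("effort").
    effort = random.getrandbits(64)
    payload = "{}{}{}{}{}".format(index, timestamp, data, previous_hash, effort)
    return effort, hashlib.sha256(payload.encode("utf-8"))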
Example #9
def find_new_chains():
    func = inspect.currentframe().f_back.f_code
    logging.info("Starting to find new chains")
    # Get the blockchains of every other node
    peers = variables.PEER_NODES
    logging.debug("peers: {}".format(len(peers)))
    other_chains = []
    for node_url in list(peers):  # iterate over a copy; unreachable peers are removed from the live list below

        blockchain_json = None
        found_blockchain = []

        url = "http://" + node_url + ":" + str(variables.PORT) + "/blocks"
        try:
            logging.debug("Attempting to access {}".format(node_url))
            blockchain_json = requests.post(url)
        except:
            logging.warning("Failed to access {}, removing from peers".format(node_url))
            variables.PEER_NODES.remove(node_url)
            continue
        # Convert the JSON object to a Python dictionary
        if blockchain_json is not None:
            blockchain_json = json.loads(blockchain_json.content)
            for block_json in blockchain_json:
                temp = Block()
                temp.importjson(block_json)
                if Utility.validate(temp):
                    logging.debug("Block validated, adding")
                    found_blockchain.append(temp)
                else:
                    logging.warning("Block NOT valid, next peer")
                    continue
            # Verify other node block is correct
            logging.debug("Attempting to validate this: {}".format(found_blockchain))
            validated = Utility.validate_blockchain(found_blockchain)
            if validated:
                logging.debug("Blockchain did validate")
                other_chains.append(found_blockchain)
            else:
                logging.warning("Blockchain did not validate")
            continue
    logging.info("Ending find new chains")
    return other_chains
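Utility.validate_blockchain is referenced but not listed. Given that each Block carries hash and previous_hash, a plausible reading is a pairwise link check; the sketch below is that assumption, nothing more:

def validate_blockchain(chain):
    # Hypothetical chain check: every block must reference the hash of its predecessor.
    for previous, current in zip(chain, chain[1:]):
        if current.previous_hash != previous.hash:
            return False
    return True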
Example #10
def proof_of_work(last_block, data):
    global event
    func = inspect.currentframe().f_back.f_code
    done = False
    now = None
    pow_hash_object = None
    effort = None
    while not done:
        if event.is_set():
            print("Exiting block creation")
            return False
        now = time.time()
        index_to_use = last_block.index + 1
        effort, pow_hash_object = Utility.genhash(index_to_use, now, data,
                                                  last_block.hash)
        leading_zeroes = Utility.leadingzeroes(pow_hash_object.digest())
        if leading_zeroes >= Variables.WORK:
            done = True
    return_block = Block(index_to_use, now, pow_hash_object.hexdigest(),
                         effort, data, last_block.hash)
    return return_block
Example #11
 def generator_fix_length_vars(self, values, name, timestamp, length=200):
     vals = []
     pkg_length = 0
     for group in self.groups:
         if name != group.name:
             continue
         for var in group.vars:
             key = var.id if var.calc_mode == 'instant' else "%s.%s" % (
                 group.name, var.id)
             if key in values.keys():
                 val = dict()
                 val['value'] = values[key]
                 val['id'] = var.id
                 val['timestamp'] = Utility.getTimeStr(timestamp)
                 val['endTime'] = Utility.getTimeStr(timestamp)
                 vals.append(val)
                 pkg_length += 1
             if pkg_length >= length:
                 pkg_length = 0
                 yield vals, group.name
                 vals = self.clear_list(vals)
         yield vals, group.name
         pkg_length = 0
         vals = self.clear_list(vals)
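The generator yields at most `length` readings per package and then flushes whatever remains for the group, so a caller simply drains it. The driver below is illustrative only; collector, values and name are placeholders, not names taken from the project:

import time

def flush_group(collector, values, name):
    # Hypothetical driver: consume each package as the generator yields it.
    timestamp = time.time()
    for vals, group_name in collector.generator_fix_length_vars(values, name, timestamp, length=200):
        if vals:   # the trailing yield may carry an empty remainder
            print("package for %s: %d values" % (group_name, len(vals)))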
Example #12
def block():

    global blockchain
    ip = request.remote_addr
    if ip != "127.0.0.1" and ip not in Variables.PEER_NODES:
        Variables.PEER_NODES.append(ip)
    if ip == '127.0.0.1':
        raw = request.data.decode('utf-8')
        parsed = xmltodict.parse(raw)
        b = Block()
        b.import_from_xml(parsed['block'])
        print("I made", b)
        blockchain.add(b)

        #Distribute the block to our peers

        for peer in list(Variables.PEER_NODES):  # copy, since unreachable peers are removed inside the loop
            try:
                url = "http://" + peer + ":" + str(Variables.PORT) + "/block"
                xml = b.export_to_xml()
                headers = {'Content-Type': 'application/xml'}
                resp = requests.post(url, data=xml, headers=headers).text
            except:
                Variables.PEER_NODES.remove(peer)
    else:
        block_number = None

        try:
            block_number = int(request.args['block_number'])
        except:
            pass
        if block_number is not None:
            return blockchain.get(block_number).export_to_xml()
        else:
            raw = request.data.decode('utf-8')
            parsed = xmltodict.parse(raw)
            b = Block()
            b.import_from_xml(parsed['block'])
            print("receiv", b)
            if Utility.validate(b):
                blockchain.add(b)
                global event
                event.set()
            else:
                print("Block did not validate",ip)
    return "0"
Example #13
    def scrapPage(self, url, verbose=False):

        if verbose:
            print(url)

        soup = self.loadSoup(url)
        soup = self.sanitize(soup)
        #split strings by spaces and newlines
        rawStrings = self.splitStrings(soup)
        #apply filters
        processedStrings = self.preprocess(rawStrings, removeLongEntries=True)
        #get pairs (word, count)
        pairs = Utility.countPairs(processedStrings)
        if verbose:
            print('|')

        imageSoup = self.loadSoup(url, 'lxml')
        return pairs, self.getImagesCount(imageSoup)
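Utility.countPairs is not shown; judging by the comment above it, it turns the processed strings into (word, count) pairs. A minimal stand-in under that assumption could be:

from collections import Counter

def countPairs(strings):
    # Hypothetical: word-frequency pairs, most frequent first.
    counts = Counter(strings)
    return sorted(counts.items(), key=lambda pair: pair[1], reverse=True)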
Example #14
def get_blocks():
    func = inspect.currentframe().f_back.f_code
    ip = request.remote_addr
    logging.info("/blocks accessed from {} via {}".format(ip, request.method))
    if request.method == 'POST':
        if str(ip) != "127.0.0.1" and ip not in variables.PEER_NODES:
            logging.debug("We didn't know that IP, adding it to Q")
            message = Utility.buildmessage("ip", ip)
            logging.debug("message: {}".format(message))
            q.put(message)
    # Load current blockchain. Only you should update your blockchain
    qfrom = "other"
    if request.args.get("update") == User.public_key:
        logging.debug("update was from our public, we updated our blockchain")
        qget = q.get()
        logging.debug("qget is {}".format(qget))
        qfrom = qget[0]
        variables.BLOCKCHAIN = qget[1]
        logging.info("Done updating our blockchain")

        return "200"
    else:
        chain_to_send = variables.BLOCKCHAIN
        logging.debug("Chain to send:{}".format(chain_to_send))
        logging.debug(
            "request was not from us, we need to give them our blockchain")
        # Converts our blocks into dictionaries so we can send them as json objects later
        chain_to_send_json = []
        for block in chain_to_send:
            logging.debug("block to send TYPE:{} details:{}".format(
                type(block), block))
            try:
                chain_to_send_json.append(block.exportjson())
            except AttributeError:
                logging.error("This is not a block {}".format(block))

        # Send our chain to whomever requested it
        chain_to_send = json.dumps(chain_to_send_json)
        logging.debug("Sending {}".format(chain_to_send))
        logging.info("Done sending out our blockchain")
        return chain_to_send
Example #15
def transaction():
    func = inspect.currentframe().f_back.f_code
    #TODO add logging to transactions, currently we can't send and receive blocks. One problem at a time.
    if request.method == 'POST':
        # On each new POST request, we extract the transaction data
        new_txion = request.get_json()
        # Then we add the transaction to our list
        if Utility.validate_signature(new_txion['from'],
                                      new_txion['signature'],
                                      new_txion['message']):
            variables.PENDING_TRANSACTIONS.append(new_txion)
            # Because the transaction was successfully
            # submitted, we log it to our console
            print("New transaction")
            print("FROM: {0}".format(new_txion['from']))
            print("TO: {0}".format(new_txion['to']))
            print("AMOUNT: {0}\n".format(new_txion['amount']))
            # Then we let the client know it worked out

            # Push to all other available nodes
            for node_url in variables.PEER_NODES:
                if node_url != request.remote_addr:
                    try:
                        headers = {"Content-Type": "application/json"}
                        requests.post("http://" + node_url + ":" + str(User.PORT) + "/txion",
                                      json=new_txion,
                                      headers=headers)
                    except:
                        pass
            return "Transaction submission successful\n"
        else:
            return "Transaction submission failed. Wrong signature\n"
    # Send pending transactions to the mining process
    elif request.method == 'GET' and request.args.get(
            "update") == User.public_key:
        pending = json.dumps(variables.PENDING_TRANSACTIONS)
        # Empty transaction list
        variables.PENDING_TRANSACTIONS = []
        return pending
Example #16
def consensus():
    func = inspect.currentframe().f_back.f_code
    logging.info("Starting Consensus")
    peers = variables.PEER_NODES
    logging.debug("Peers: {}".format(peers))
    if len(peers) == 0:
        logging.info("Ending consensus, we have no peers")
        return False

    # Get the blocks from other nodes
    other_chains = find_new_chains()
    if len(other_chains) == 0:
        logging.debug("no chains found")
        logging.info("Ending consensus, no other chains found")
        return False
    if not isinstance(other_chains[0], list):
        if len(other_chains) < len(variables.BLOCKCHAIN):
            logging.debug("Our blockchain is bigger")
            logging.info("Ending consensus, rejecting others")
            return False
        else:
            logging.info("Ending consensus, we have one peer with a longer blockchain")

            return other_chains
    # If our chain isn't longest, then we store the longest chain
    longest = 0
    max_length = 0
    for i in range(len(other_chains)):
        if Utility.validate_blockchain(other_chains[i]):
            chain_length = len(other_chains[i])
            if chain_length > max_length:
                max_length = chain_length
                longest = i
    if len(other_chains[longest]) == len(variables.BLOCKCHAIN):
        logging.debug("Our blockchain is the same size1")
        logging.info("Ending consensus, rejecting others")
        return False
    logging.info("Ending Consensus with a chain")
    logging.debug("Consensus returned: {}".format(other_chains[longest]))
    return other_chains[longest]
Example #17
    def getFeature(self, train_query_df, product_df, attribute_df, test_query_df,
                   features="brand,attribute,spelling,nonascii,stopwords,colorExist,color_onehot,brandExist,wmdistance,stemming,word2vec,Word2VecQueryExpansion,tfidf,tfidf_expandedquery,doc2vec,doc2vec_expandedquery,bm25,bm25expandedquery,doclength"):
        ## Please feel free to add feature into this method.
        ## For testing, you may want to comment out some feature generation to save time
        ## as some takes a long time to run.

        timetracker=Utility()
        if features.find("brand") != -1:
            # Create Brand Column
            product_df = self.__createBrandColumn(product_df, attribute_df)

        if features.find("attribute") != -1:
            # Create Attribute column as a JSON string
            # Column name is attr_json
            product_df = self.__createAttributeColumn(product_df, attribute_df)

        if features.find("spelling") != -1:
            # Perform spell correction on search_term
            print("Performing spell correction")
            spell_dict = Feature_Spelling.getSpellingCorrectionDict()
            # print(self.__spell_correction('lifeswivel', spell_dict))
            train_query_df['search_term'] = train_query_df['search_term'].map(
                lambda x: self.__spell_correction(x, spell_dict))
            product_df['product_description'] = product_df['product_description'].map(
                lambda x: self.__spell_correction(x, spell_dict))
            product_df['product_title'] = product_df['product_title'].map(
                lambda x: self.__spell_correction(x, spell_dict))
            product_df['attr_json'] = product_df['attr_json'].map(
                lambda x: self.__spell_correction(str(x), spell_dict))

        if features.find("nonascii") != -1:
            # Remove non-ascii characters
            print("Performing non-ascii removal")
            start_time = time.time()
            train_query_df['search_term'] = train_query_df['search_term'].map(lambda x: self.__nonascii_clean((x)))
            print("Non-ascii clean on search_term took: %s minutes" % round(((time.time() - start_time) / 60), 2))
            product_df['product_title'] = product_df['product_title'].map(lambda x: self.__nonascii_clean(str(x)))
            print("Non-ascii clean on product_title took: %s minutes" % round(((time.time() - start_time) / 60), 2))

        # Run this to download the download the stopword list if you hit error
        # nltk.download()

        if features.find("stopwords") != -1:
            # Stopwords removal
            print("Performing stopwords removal")
            start_time = time.time()
            train_query_df['search_term'] = train_query_df['search_term'].map(lambda x: self.__stopword_removal((x)))
            print("stopwords removal on search_term took: %s minutes" % round(((time.time() - start_time) / 60), 2))
            product_df['product_title'] = product_df['product_title'].map(lambda x: self.__stopword_removal(str(x)))
            print("stopwords removal on product_title took: %s minutes" % round(((time.time() - start_time) / 60), 2))
            product_df['product_description'] = product_df['product_description'].map(lambda x: self.__stopword_removal(str(x)))
            print("stopwords removal on product_description took: %s minutes" % round(((time.time() - start_time) / 60), 2))
            product_df['attr_json'] = product_df['attr_json'].map(lambda x: self.__stopword_removal(str(x)))
            print("stopwords removal on attr_jason took: %s minutes" % round(((time.time() - start_time) / 60), 2))

        if features.find("colorExist") != -1:
            # Check if color in search_term exist in product_description column
            print("Performing color and material check")
            start_time = time.time()
            color = Feature_ColorMaterial()
            train_query_df['color'] = color.checkColorMaterialExists(train_query_df, product_df)
            train_query_df['color_exist'] = train_query_df['color'].map(lambda x: 1 if len(x)>0 else 0)
            # Save some memory. Change it to uint8
            train_query_df.color_exist = train_query_df.color_exist.astype(np.uint8)

            if features.find("color_onehot") != -1:
                train_query_df = self.__onehot_color(train_query_df)

            # Clean up unused column
            train_query_df.pop('color')
            print("Color and material check took: %s minutes" % round(((time.time() - start_time) / 60), 2))

        if features.find("brandExist") != -1:
            # Check if brand in search term exist product_brand column
            print("Performing brand check")
            start_time = time.time()

            train_query_df['brand_exist'] = self.__brandExist(train_query_df, product_df)
            # train_query_df['brand_exist'] = train_query_df['search_term'].map(lambda x: 1 if len(x)>0 else 0)
            print("Brand check took: %s minutes" % round(((time.time() - start_time) / 60), 2))

        if features.find('wmdistance') != -1:
            print("Performing Word Mover Distance")
            start_time = time.time()

            wm = Feature_WordMoverDistance()
            train_query_df['wm_product_description'] = wm.getDistance(train_query_df, 'search_term',
                                                                      product_df, 'product_description')
            print("WMDistance for product_description took: %s minutes" % round(((time.time() - start_time) / 60), 2))
            train_query_df['wm_product_title'] = wm.getDistance(train_query_df, 'search_term',
                                                                      product_df, 'product_title')
            print("WMDistance for product_title took: %s minutes" % round(((time.time() - start_time) / 60), 2))
            train_query_df['wm_product_brand'] = wm.getDistance(train_query_df, 'search_term',
                                                                      product_df, 'product_brand')
            print("WMDistance for product_brand took: %s minutes" % round(((time.time() - start_time) / 60), 2))
            train_query_df['wm_attr_json'] = wm.getDistance(train_query_df, 'search_term',
                                                                      product_df, 'attr_json')
            print("WMDistance for attr_json took: %s minutes" % round(((time.time() - start_time) / 60), 2))

        if features.find("stemming") != -1:
            # # Stemming
            print("Performing Stemming")
            start_time = time.time()
            train_query_df['search_term'] = train_query_df['search_term'].map(lambda x: self.__stemming((x)))
            print("Stemming search_term took: %s minutes" % round(((time.time() - start_time) / 60), 2))
            product_df['product_title'] = product_df['product_title'].map(lambda x: self.__stemming(str(x)))
            print("Stemming product_title took: %s minutes" % round(((time.time() - start_time) / 60), 2))
            product_df['product_brand'] = product_df['product_brand'].map(lambda x: self.__stemming(str(x)))
            print("Stemming product_brand took: %s minutes" % round(((time.time() - start_time) / 60), 2))
            product_df['product_description'] = product_df['product_description'].map(lambda x: self.__stemming(str(x)))
            print("Stemming product_description took: %s minutes" % round(((time.time() - start_time) / 60), 2))
            product_df['attr_json'] = product_df['attr_json'].map(lambda x: self.__stemming(str(x)))
            print("Stemming attr_json took: %s minutes" % round(((time.time() - start_time) / 60), 2))

        if features.find("word2vec") != -1:
            # Word2Vec
            print("===========Performing word2vec computation....this may take a while")
            timetracker.startTimeTrack()
            print("Merging product_title and description")
            print(list(product_df))
            product_df['content'] = product_df['product_title'].map(str) + " " + \
                                    product_df['product_description'].map(str) + " " + \
                                    product_df['product_brand'].map(str)
            timetracker.checkpointTimeTrack()
            print("Adding training query for that product id into the content")
            product_df = product_df.reset_index(drop=True)
            counter = 0
            for index, product in product_df.iterrows():
                # print("product:", product)
                productId = product['product_uid']
                # print("productId:",productId)
                df = train_query_df[train_query_df.product_uid == productId]
                # print("df:",df)
                searchterms = ""
                for index, row in df.iterrows():
                    searchterm = row['search_term']
                    searchterms = searchterms + " " + searchterm

                newString = product_df.iloc[counter]['content'] + " " + searchterms
                product_df.at[counter, 'content'] = newString

                counter = counter + 1

            timetracker.checkpointTimeTrack()

            w2v = Feature_Word2Vec.Feature_Word2Vec()
            print("Convert DF into sentences for word2vec processing")
            sentences = w2v.convertDFIntoSentences(product_df, 'content')
            timetracker.checkpointTimeTrack()
            print("Training word2vec")
            w2v.trainModel(sentences)
            timetracker.checkpointTimeTrack()
            print("Validating...this should give some results like sofa")
            print(w2v.getVectorFromWord('stool'))
            print(w2v.getSimilarWordVectors('stool', 5))
            print("===========Completed word2vec computation")

        ##WARNING: This has to be before bm25expandedquery function call
        if features.find("Word2VecQueryExpansion") != -1:
            # Word2VecQueryExpansion
            print("===========Performing Word2VecQueryExpansion computation....this may take a super long time")
            timetracker.startTimeTrack()
            # print("Merging product_title and description")
            # print(list(product_df))
            # product_df['content']=product_df['product_title'].map(str) +" "+ \
            #                       product_df['product_description'].map(str) + " " + \
            #                       product_df['product_brand'].map(str)
            # product_df.head(1)
            print("Compute Word2VecQueryExpansion")
            w2cExpand = Word2VecQueryExpansion()
            timetracker.checkpointTimeTrack()
            # print("Remove merged column")
            # product_df=product_df.drop('content', axis=1)
            # For every training query-document pair, generate bm25
            print("Generate Word2VecQueryExpansion column")
            train_query_df = w2cExpand.computeExpandedQueryColumn(trainset=train_query_df,
                                                                  colName='Word2VecQueryExpansion')
            timetracker.checkpointTimeTrack()
            print("train_query_df:", list(train_query_df))
            print("train_query_df head:", train_query_df.head(1))
            print("Saving to csv")
            train_query_df.to_csv('../data.prune/train_query_with_Word2VecQueryExpansion.csv')
            timetracker.checkpointTimeTrack()
            print("===========Completed Word2VecQueryExpansion computation")

        if features.find("tfidf") != -1:
            # TF-IDF
            print("Performing TF-IDF")
            tfidf = Feature_TFIDF()
            train_query_df['tfidf_product_title'] = tfidf.getCosineSimilarity(train_query_df, 'search_term', product_df,
                                                                              'product_title')
            train_query_df['tfidf_product_brand'] = tfidf.getCosineSimilarity(train_query_df, 'search_term', product_df,
                                                                              'product_brand')
            train_query_df['tfidf_product_description'] = tfidf.getCosineSimilarity(train_query_df, 'search_term', product_df,
                                                                              'product_description')
            train_query_df['tfidf_attr_json'] = tfidf.getCosineSimilarity(train_query_df, 'search_term',
                                                                                    product_df,
                                                                                    'attr_json')
        if features.find("tfidf_expandedquery") != -1:
            # TF-IDF on expanded query
            print("Performing TF-IDF with expanded query")
            tfidf = Feature_TFIDF()
            train_query_df['tfidf_expanded_product_title'] = tfidf.getCosineSimilarity(train_query_df, 'Word2VecQueryExpansion', product_df,
                                                                              'product_title')
            train_query_df['tfidf_expanded_product_brand'] = tfidf.getCosineSimilarity(train_query_df, 'Word2VecQueryExpansion', product_df,
                                                                              'product_brand')
            train_query_df['tfidf_expanded_product_description'] = tfidf.getCosineSimilarity(train_query_df, 'Word2VecQueryExpansion', product_df,
                                                                              'product_description')
            train_query_df['tfidf_expanded_attr_json'] = tfidf.getCosineSimilarity(train_query_df, 'Word2VecQueryExpansion',
                                                                                    product_df,
                                                                                    'attr_json')

        if features.find("doc2vec") != -1:
            # Doc2Vec
            print("Performing Doc2Vec")
            doc2vec = Feature_Doc2Vec()
            train_query_df['doc2vec_product_title'] = doc2vec.getCosineSimilarity(train_query_df, 'search_term', product_df,
                                                                              'product_title')
            doc2vec = Feature_Doc2Vec()
            train_query_df['doc2vec_product_brand'] = doc2vec.getCosineSimilarity(train_query_df, 'search_term', product_df,
                                                                              'product_brand')
            doc2vec = Feature_Doc2Vec()
            train_query_df['doc2vec_product_description'] = doc2vec.getCosineSimilarity(train_query_df, 'search_term', product_df,
                                                                              'product_description')
            doc2vec = Feature_Doc2Vec()
            train_query_df['doc2vec_attr_json'] = doc2vec.getCosineSimilarity(train_query_df, 'search_term',
                                                                                        product_df,
                                                                                        'attr_json')

        if features.find("doc2vec_expandedquery") != -1:
            # Doc2Vec
            print("Performing Doc2Vec with expanded query")
            doc2vec = Feature_Doc2Vec()
            train_query_df['doc2vec_expanded_product_title'] = doc2vec.getCosineSimilarity(train_query_df,
                                                                                  'Word2VecQueryExpansion',
                                                                                  product_df,
                                                                                  'product_title')
            doc2vec = Feature_Doc2Vec()
            train_query_df['doc2vec_expanded_product_brand'] = doc2vec.getCosineSimilarity(train_query_df,
                                                                                  'Word2VecQueryExpansion',
                                                                                  product_df,
                                                                                  'product_brand')
            doc2vec = Feature_Doc2Vec()
            train_query_df['doc2vec_expanded_product_description'] = doc2vec.getCosineSimilarity(train_query_df,
                                                                                        'Word2VecQueryExpansion',
                                                                                        product_df,
                                                                                        'product_description')
            doc2vec = Feature_Doc2Vec()
            train_query_df['doc2vec_expanded_attr_json'] = doc2vec.getCosineSimilarity(train_query_df,
                                                                              'Word2VecQueryExpansion',
                                                                              product_df,
                                                                              'attr_json')

        if features.find("bm25") != -1:
            # BM25
            print("===========Performing BM25 computation....this may take a while")
            timetracker.startTimeTrack()
            print("Merging product_title and description")
            print(list(product_df))
            product_df['content']=product_df['product_title'].map(str) +" "+ \
                                  product_df['product_description'].map(str) + " " + \
                                  product_df['product_brand'].map(str)
            timetracker.checkpointTimeTrack()

            print("Adding training query for that product id into the content")
            product_df=product_df.reset_index(drop=True)
            counter=0
            for index,product in product_df.iterrows():
                # print("product:", product)
                productId=product['product_uid']
                # print("productId:",productId)
                df=train_query_df[train_query_df.product_uid==productId]
                # print("df:",df)
                searchterms=""
                for index,row in df.iterrows():
                    searchterm=row['search_term']
                    searchterms=searchterms+" "+searchterm

                newString=product_df.iloc[counter]['content']+" "+searchterms
                product_df.at[counter, 'content'] = newString

                counter=counter+1

            timetracker.checkpointTimeTrack()

            print("Compute BM25")
            bm25 = Feature_BM25(product_df)
            timetracker.checkpointTimeTrack()
            print("Remove merged column")
            product_df=product_df.drop('content', axis=1)
            #For every training query-document pair, generate bm25
            print("Generate bm25 column")
            train_query_df=bm25.computeBM25Column(trainset=train_query_df,destColName='bm25', searchTermColname='search_term')
            timetracker.checkpointTimeTrack()
            print("train_query_df:",list(train_query_df))
            print("train_query_df head:",train_query_df.head(1))
            print("Saving to csv")
            train_query_df.to_csv('../data.prune/train_query_with_bm25_search_term.csv')
            timetracker.checkpointTimeTrack()
            print("===========Completed BM25 computation")

        if features.find("bm25expandedquery") != -1:
            if features.find("Word2VecQueryExpansion") != -1:
                # bm25expandedquery
                print("===========Performing BM25expanded computation....this may take a while")
                timetracker.startTimeTrack()
                print("Merging product_title and description")
                print(list(product_df))
                product_df['content']=product_df['product_title'].map(str) +" "+ \
                                      product_df['product_description'].map(str) + " " + \
                                      product_df['product_brand'].map(str)
                product_df.head(1)
                timetracker.checkpointTimeTrack()

                print("Adding training query for that product id into the content")
                product_df = product_df.reset_index(drop=True)
                counter = 0
                for index, product in product_df.iterrows():
                    # print("product:", product)
                    productId = product['product_uid']
                    # print("productId:",productId)
                    df = train_query_df[train_query_df.product_uid == productId]
                    # print("df:",df)
                    searchterms = ""
                    for index, row in df.iterrows():
                        searchterm = row['search_term']
                        searchterms = searchterms + " " + searchterm

                    newString = product_df.iloc[counter]['content'] + " " + searchterms
                    product_df.at[counter, 'content'] = newString

                    counter = counter + 1

                timetracker.checkpointTimeTrack()


                print("Compute BM25")
                bm25 = Feature_BM25(product_df)
                timetracker.checkpointTimeTrack()
                print("Remove merged column")
                product_df=product_df.drop('content', axis=1)
                #For every training query-document pair, generate bm25
                print("Generate bm25 column")
                train_query_df=bm25.computeBM25Column(trainset=train_query_df,destColName='bm25expandedquery', searchTermColname='Word2VecQueryExpansion')
                timetracker.checkpointTimeTrack()
                print("train_query_df:",list(train_query_df))
                print("train_query_df head:",train_query_df.head(1))
                print("Saving to csv")
                train_query_df.to_csv('../data.prune/train_query_with_bm25_Word2VecQueryExpansion.csv')
                timetracker.checkpointTimeTrack()
                print("===========Completed BM25expanded computation")
            else:
                print("ERROR: Cannot proceed with bm25expandedquery. Word2VecQueryExpansion is not enabled. It is a prerequisite of bm25expandedquery.")


        if features.find("bm25description") != -1:
            if features.find("Word2VecQueryExpansion") != -1:
                # bm25expandedquery
                print("===========Performing bm25description computation....this may take a while")
                timetracker.startTimeTrack()
                print(list(product_df))
                # product_df['content']=product_df['product_title'].map(str) +" "+ \
                #                       product_df['product_description'].map(str) + " " + \
                #                       product_df['product_brand'].map(str)
                product_df['content']=product_df['product_description'].map(str)

                product_df.head(1)
                timetracker.checkpointTimeTrack()

                print("Adding training query for that product id into the content")
                product_df = product_df.reset_index(drop=True)
                counter = 0
                for index, product in product_df.iterrows():
                    # print("product:", product)
                    productId = product['product_uid']
                    # print("productId:",productId)
                    df = train_query_df[train_query_df.product_uid == productId]
                    # print("df:",df)
                    searchterms = ""
                    for index, row in df.iterrows():
                        searchterm = row['search_term']
                        searchterms = searchterms + " " + searchterm

                    newString = product_df.iloc[counter]['content'] + " " + searchterms
                    product_df.at[counter, 'content'] = newString

                    counter = counter + 1

                timetracker.checkpointTimeTrack()


                print("Compute BM25")
                bm25 = Feature_BM25(product_df)
                timetracker.checkpointTimeTrack()
                print("Remove merged column")
                product_df=product_df.drop('content', axis=1)
                #For every training query-document pair, generate bm25
                print("Generate bm25 column")
                train_query_df=bm25.computeBM25Column(trainset=train_query_df,destColName='bm25description', searchTermColname='Word2VecQueryExpansion')
                timetracker.checkpointTimeTrack()
                print("train_query_df:",list(train_query_df))
                print("train_query_df head:",train_query_df.head(1))
                print("Saving to csv")
                train_query_df.to_csv('../data.prune/train_query_with_bm25_Word2VecQueryExpansion.csv')
                timetracker.checkpointTimeTrack()
                print("===========Completed bm25description computation")
            else:
                print("ERROR: Cannot proceed with bm25description. Word2VecQueryExpansion is not enabled. It is a prerequisite of bm25expandedquery.")


        if features.find("bm25title") != -1:
            if features.find("Word2VecQueryExpansion") != -1:
                # bm25expandedquery
                print("===========Performing bm25title computation....this may take a while")
                timetracker.startTimeTrack()
                print(list(product_df))
                # product_df['content']=product_df['product_title'].map(str) +" "+ \
                #                       product_df['product_description'].map(str) + " " + \
                #                       product_df['product_brand'].map(str)
                product_df['content']=product_df['product_title'].map(str)

                product_df.head(1)
                timetracker.checkpointTimeTrack()

                print("Adding training query for that product id into the content")
                product_df = product_df.reset_index(drop=True)
                counter = 0
                for index, product in product_df.iterrows():
                    # print("product:", product)
                    productId = product['product_uid']
                    # print("productId:",productId)
                    df = train_query_df[train_query_df.product_uid == productId]
                    # print("df:",df)
                    searchterms = ""
                    for index, row in df.iterrows():
                        searchterm = row['search_term']
                        searchterms = searchterms + " " + searchterm

                    newString = product_df.iloc[counter]['content'] + " " + searchterms
                    product_df.at[counter, 'content'] = newString

                    counter = counter + 1

                timetracker.checkpointTimeTrack()


                print("Compute BM25")
                bm25 = Feature_BM25(product_df)
                timetracker.checkpointTimeTrack()
                print("Remove merged column")
                product_df=product_df.drop('content', axis=1)
                #For every training query-document pair, generate bm25
                print("Generate bm25 column")
                train_query_df=bm25.computeBM25Column(trainset=train_query_df,destColName='bm25title', searchTermColname='Word2VecQueryExpansion')
                timetracker.checkpointTimeTrack()
                print("train_query_df:",list(train_query_df))
                print("train_query_df head:",train_query_df.head(1))
                print("Saving to csv")
                train_query_df.to_csv('../data.prune/train_query_with_bm25_Word2VecQueryExpansion.csv')
                timetracker.checkpointTimeTrack()
                print("===========Completed bm25title computation")
            else:
                print("ERROR: Cannot proceed with bm25title. Word2VecQueryExpansion is not enabled. It is a prerequisite of bm25expandedquery.")


        if features.find("bm25brand") != -1:
            if features.find("Word2VecQueryExpansion") != -1:
                # bm25expandedquery
                print("===========Performing bm25brand computation....this may take a while")
                timetracker.startTimeTrack()
                print(list(product_df))
                # product_df['content']=product_df['product_title'].map(str) +" "+ \
                #                       product_df['product_description'].map(str) + " " + \
                #                       product_df['product_brand'].map(str)
                product_df['content']=product_df['product_brand'].map(str)

                product_df.head(1)
                timetracker.checkpointTimeTrack()

                print("Adding training query for that product id into the content")
                product_df = product_df.reset_index(drop=True)
                counter = 0
                for index, product in product_df.iterrows():
                    # print("product:", product)
                    productId = product['product_uid']
                    # print("productId:",productId)
                    df = train_query_df[train_query_df.product_uid == productId]
                    # print("df:",df)
                    searchterms = ""
                    for index, row in df.iterrows():
                        searchterm = row['search_term']
                        searchterms = searchterms + " " + searchterm

                    newString = product_df.iloc[counter]['content'] + " " + searchterms
                    product_df.at[counter, 'content'] = newString

                    counter = counter + 1

                timetracker.checkpointTimeTrack()


                print("Compute BM25")
                bm25 = Feature_BM25(product_df)
                timetracker.checkpointTimeTrack()
                print("Remove merged column")
                product_df=product_df.drop('content', axis=1)
                #For every training query-document pair, generate bm25
                print("Generate bm25 column")
                train_query_df=bm25.computeBM25Column(trainset=train_query_df,destColName='bm25brand', searchTermColname='Word2VecQueryExpansion')
                timetracker.checkpointTimeTrack()
                print("train_query_df:",list(train_query_df))
                print("train_query_df head:",train_query_df.head(1))
                print("Saving to csv")
                train_query_df.to_csv('../data.prune/train_query_with_bm25_Word2VecQueryExpansion.csv')
                timetracker.checkpointTimeTrack()
                print("===========Completed bm25brand computation")
            else:
                print("ERROR: Cannot proceed with bm25brand. Word2VecQueryExpansion is not enabled. It is a prerequisite of bm25expandedquery.")



        if features.find("doclength") != -1:
            # Document Length
            print("Performing Document Length")
            product_df['len_product_title'] = product_df['product_title'].map(lambda x: len(homedepotTokeniser(x)))
            train_query_df = pd.merge(train_query_df, product_df[['product_uid', 'len_product_title']], how='left',
                                      on='product_uid')
            product_df['len_product_description'] = product_df['product_description'].map(lambda x: len(homedepotTokeniser(x)))
            train_query_df = pd.merge(train_query_df, product_df[['product_uid', 'len_product_description']], how='left',
                                      on='product_uid')
            product_df['len_brand'] = product_df['product_brand'].map(lambda x: len(homedepotTokeniser(x)))
            train_query_df = pd.merge(train_query_df, product_df[['product_uid', 'len_brand']], how='left',
                                      on='product_uid')
            train_query_df['len_search_term'] = train_query_df['search_term'].map(lambda x: len(homedepotTokeniser(x)))

        if features.find("pmi") != -1:
            print("===========Performing pmi computation....this may take a while")
            timetracker.startTimeTrack()
            print(list(product_df))
            product_df['content'] = product_df['product_title'].map(str) + " " + \
                                    product_df['product_description'].map(str)

            timetracker.checkpointTimeTrack()

            print("Adding training query for that product id into the content")

            product_df = product_df.reset_index(drop=True)
            counter = 0
            for index, product in product_df.iterrows():
                # print("product:", product)
                productId = product['product_uid']
                # print("productId:",productId)
                df = train_query_df[train_query_df.product_uid == productId]
                # print("df:",df)
                searchterms = ""
                for index, row in df.iterrows():
                    searchterm = row['search_term']
                    searchterms = searchterms + " " + searchterm

                newString = product_df.iloc[counter]['content'] + " " + searchterms
                product_df.at[counter, 'content'] = newString

                counter = counter + 1
            timetracker.checkpointTimeTrack()

            # Creating content
            text = product_df['content'].str.cat(sep=' ')
            pmiFeature = Feature_PMI.Feature_PMI(text)
            # print("PMI 'kitchen','cabinet': ", pmiFeature.computePMI('kitchen', 'cabinet'))
            train_query_df = pmiFeature.computePMIColumn(trainset=train_query_df)
            # print(list(train_query_df), "\n", train_query_df['pmi'])
            # train_query_df.filter(items=['id', 'pmi']).to_csv('pmi_features.csv')

        print("train_query_df final column:\n", train_query_df.info())

        return train_query_df
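Called end to end, the method takes the four Home Depot DataFrames plus a comma-separated feature string. The sketch below only illustrates the calling convention with a reduced feature list; the CSV paths and the FeatureEngineering class name are assumptions, not confirmed by this page:

import pandas as pd

# Assumed input files and class name, for illustration only.
train_query_df = pd.read_csv('../data/train.csv', encoding='ISO-8859-1')
product_df = pd.read_csv('../data/product_descriptions.csv')
attribute_df = pd.read_csv('../data/attributes.csv')
test_query_df = pd.read_csv('../data/test.csv', encoding='ISO-8859-1')

fe = FeatureEngineering()   # whichever class defines getFeature in the project
features_df = fe.getFeature(train_query_df, product_df, attribute_df, test_query_df,
                            features="brand,spelling,stopwords,tfidf,doclength")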
Example #18
 def __init__(self, driver):
     self.driver = driver
     Utility.__init__(self, driver)
Example #19
def start(a):
    global q
    q = a
    global node
    node.config['SECRET_KEY'] = Utility.createHexdigest(User.password)
    node.run(host="0.0.0.0", port=variables.PORT)
Example #20
# cols=['color_exist','len_product_description']
# getFeatureRMSEAgainstBaseline(cols)
# cols=['color_exist','len_brand']
# getFeatureRMSEAgainstBaseline(cols)
# cols=['color_exist','len_search_term']
# getFeatureRMSEAgainstBaseline(cols)
# cols=['color_exist','sense2vec_all_simscore','sense2vec_keeptag_simscore','sense2vec_uidfact_all_simscore','sense2vec_uidfact_keeptag_simscore','sense2vec_all_attr_simscore','sense2vec_keeptag_attr_simscore','sense2vec_uidfact_all_attr_simscore','sense2vec_uidfact_keeptag_attr_simscore']
# getFeatureRMSEAgainstBaseline(cols)
# cols=['color_exist','product_uid_threshold']
# getFeatureRMSEAgainstBaseline(cols)
# cols=['color_exist','noun_overlap_counts','noun_uniq_overlap_counts','noun_overlap_ratio']
# getFeatureRMSEAgainstBaseline(cols)

if __name__ == "__main__":
    # print("Should not print")
    utility = Utility()
    utility.startTimeTrack()
    # This part skips the feature training and simply use it.

    # print("Reading features_full_plusnouns set")
    # all_df=pd.read_csv('../data/features_full_plusnouns_pluspuidthresh.csv')

    myFeatureSetFileReference = '../data/features_doc2vec_sense2vec_pmi_20170418.csv'
    print("Reading features_doc2vec_sense2vec_pmi_20170418 set")
    all_df = pd.read_csv(myFeatureSetFileReference, low_memory=True)
    print("Completed: Reading features_doc2vec_sense2vec_pmi_20170418 set")
    feature_train_df = all_df[:74067]

    # feature_train_df.drop('doc2vec_search_term_vector', axis=1, inplace=True)
    # feature_train_df.drop('doc2vec_product_title_vector', axis=1, inplace=True)
    # feature_train_df.drop('doc2vec_product_brand_vector', axis=1, inplace=True)
Exemple #21
0
def exeFMBidModel(testDF=None, validateDF=None, trainDF=None, trainReader=None, validationReader=None, testReader=None, writeResult2CSV=False):
    print("============ Factorisation Machine bid model....setting up")

    timer = Utility()
    timer.startTimeTrack()

    print("Getting encoded datasets")
    trainOneHotData, trainY = trainReader.getOneHotData()
    validationOneHotData, valY = validationReader.getOneHotData(train_cols=trainOneHotData.columns.get_values().tolist())
    testOneHotData, testY = testReader.getOneHotData(train_cols=trainOneHotData.columns.get_values().tolist())
    timer.checkpointTimeTrack()

    print("trainOneHotData:",trainOneHotData.shape,list(trainOneHotData))
    print("trainY:", trainY.shape, list(trainY))
    print("validationOneHotData:",validationOneHotData.shape,list(validationOneHotData))
    print("valY:", valY.shape, list(valY))

    fmBidModel=FMBidModel.FMBidModel(cBudget=6250 * 1000, modelType='fmclassificationsgd')
    print("==========Training starts")
    # fmBidModel.gridSearchandCrossValidateFastSGD(trainOneHotData, trainY)
    # timer.checkpointTimeTrack()

    fmBidModel.trainModel(trainOneHotData,trainY, retrain=True, modelFile="data.pruned/fmclassificationsgd.pkl")
    timer.checkpointTimeTrack()

    print("==========Validation starts")
    predictedProb=fmBidModel.validateModel(validationOneHotData, valY)
    timer.checkpointTimeTrack()

    # print("==========Bid optimisation starts")
    # fmBidModel.optimiseBid(validationOneHotData,valY)
    # timer.checkpointTimeTrack()

    # best score      0.3683528286042599
    # noBidThreshold  2.833333e-01
    # minBid          2.000000e+02
    # bidRange        9.000000e+01
    # sigmoidDegree - 1.000000e+01
    # won             3.432900e+04
    # click           1.380000e+02
    # spend           2.729869e+06
    # trimmed_bids    0.000000e+00
    # CTR             4.019925e-03
    # CPM             7.952078e+04
    # CPC             1.978166e+04
    # blended_score   3.683528e-01

    # best score      0.3681133881545131
    # noBidThreshold  2.833333e-01
    # minBid          2.000000e+02
    # bidRange        1.000000e+02
    # sigmoidDegree - 1.000000e+01
    # won             3.449900e+04
    # click           1.380000e+02
    # spend           2.758561e+06
    # trimmed_bids    0.000000e+00
    # CTR             4.000116e-03
    # CPM             7.996061e+04
    # CPC             1.998957e+04
    # blended_score   3.681134e-01


    # New budget      6250000
    # FM
    # best score      0.32755084132163526
    # noBidThreshold  8.666667e-01
    # minBid          2.000000e+02
    # bidRange        2.500000e+02
    # sigmoidDegree - 1.000000e+01
    # won             1.461000e+04
    # click           1.170000e+02
    # spend           1.124960e+06
    # trimmed_bids    0.000000e+00
    # CTR             8.008214e-03
    # CPM             7.699932e+04
    # CPC             9.615043e+03
    # blended_score   3.275508e-01

    # print("==========Getting  bids")
    ## 25000 budget
    # bidIdPriceDF=fmBidModel.getBidPrice(validationOneHotData,valY,noBidThreshold=0.2833333,minBid=200,bidRange=100,sigmoidDegree=-10)
    ## 6250 budget
    # bidIdPriceDF=fmBidModel.getBidPrice(validationOneHotData,valY,noBidThreshold=0.8666667,minBid=200,bidRange=250,sigmoidDegree=-10)
    # print("bidIdPriceDF:",bidIdPriceDF.shape, list(bidIdPriceDF))
    # bidIdPriceDF.to_csv("mybids.csv")
    # timer.checkpointTimeTrack()

    return predictedProb
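
The result blocks commented above appear to follow the usual bidding metrics: CTR = clicks / impressions won, CPM = spend per thousand impressions, CPC = spend per click. A quick check against the first block (won 34329, click 138, spend 2729869):

won, click, spend = 34329, 138, 2729869
print("CTR:", click / won)          # ~4.02e-03, matches 4.019925e-03
print("CPM:", spend / won * 1000)   # ~7.95e+04, matches 7.952078e+04
print("CPC:", spend / click)        # ~1.98e+04, matches 1.978166e+04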
Exemple #22
0
import logging
FORMAT = "[{%(levelname)s} %(filename)s:%(lineno)s 	- %(funcName)20s() ] %(message)s"
logging.basicConfig(filename='scratch.log', level=logging.DEBUG, format=FORMAT)
from Blockchain_classes.Blockchain import Blockchain
import time
from Blockchain_classes.Block import Block
import User_classes.User as User
import Utilities.Utility as Utility

WORK = 5
genesis = Utility.create_genesis_block()

added = 0

blockchain = Blockchain(genesis)
while added < 100:
    last_block = blockchain.last_added()

    now = time.time()
    data = [{"from": "network", "to": User.public_key, "amount": 1.0}]
    done = False
    block = None
    while not done:
        effort, pow_hash_object = Utility.genhash(last_block.index + 1, now,
                                                  data, last_block.hash)
        #this is a test ....
        leading_zeroes = Utility.leadingzeroes(pow_hash_object.digest())
        if leading_zeroes >= WORK:
            done = True
    added += 1
    b = Block(last_block.index + 1, now, pow_hash_object.hexdigest(), effort,
              data, last_block.hash)
Exemple #23
0
    def __get_value(self, v, r, byte_order):
        if r is None:
            return None

        length = self.__get_block_length(v.type)  # avoid shadowing the builtin len()
        if re.match("words:", v.type, re.M | re.I):
            var = []
            for a in r:
                h = ((a & 0xff00) >> 8)
                l = (a & 0xff)
                if re.search(r'(^ab($|cd$))|(^cdab$)', byte_order) is not None:
                    var.append(h << 8 | l)
                else:
                    var.append(a)
            # for a in Utility.toDoubleList(r):
            #     h0 = ((a[0] & 0xff00) >> 8)
            #     l0 = (a[0] & 0xff)
            #     h1 = ((a[1] & 0xff00) >> 8)
            #     l1 = (a[1] & 0xff)
            #     d = Utility.toDWord(l0, h0, l1, h1, byte_order, v.type)
            #     var.append(((d & 0xffff0000) >> 16))
            #     var.append((d & 0xffff))
            # logger.debug("get words data : %s" % var)
            logger.debug("Get words data.")
            return var
        elif re.match("bytes:", v.type, re.M | re.I):
            var = []
            for a in r:
                h = ((a & 0xff00) >> 8)
                l = (a & 0xff)
                if re.search(r'(^ab($|cd$))|(^cdab$)', byte_order) is not None:
                    var.append(h)
                    var.append(l)
                else:
                    var.append(l)
                    var.append(h)
            # logger.debug("get bytes data : %s" % var)
            logger.debug("Get bytes data.")
            return var
        elif re.match("string:", v.type, re.M | re.I):
            var = []
            for a in r:
                h = ((a & 0xff00) >> 8)
                l = (a & 0xff)
                if re.search(r'(^ab($|cd$))|(^cdab$)', byte_order) is not None:
                    # var.append(chr(h << 8 | l))
                    var.append(chr(h))
                    var.append(chr(l))
                else:
                    # var.append(chr(a))
                    var.append(chr(l))
                    var.append(chr(h))
            # logger.debug("get string data : %s" % ''.join(var))
            logger.debug("Get string data.")
            return ''.join(var)
        elif re.match("bits:", v.type, re.M | re.I):
            var = []
            for a in r:
                var.append(0 if a == 0 else 1)
            # logger.debug("get bits data : %s" % var)
            logger.debug("Get bits data.")
            return var
        elif re.match("dwords:", v.type, re.M | re.I):
            var = []
            for a in Utility.toDoubleList(r):
                h0 = ((a[0] & 0xff00) >> 8)
                l0 = (a[0] & 0xff)
                h1 = ((a[1] & 0xff00) >> 8)
                l1 = (a[1] & 0xff)
                var.append(Utility.toDWord(l0, h0, l1, h1, byte_order, v.type))
            # logger.debug("get dwords data : %s" % var)
            logger.debug("Get dwords data.")
            return var
        elif re.match("floats:", v.type, re.M | re.I):
            var = []
            for a in Utility.toDoubleList(r):
                h0 = ((a[0] & 0xff00) >> 8)
                l0 = (a[0] & 0xff)
                h1 = ((a[1] & 0xff00) >> 8)
                l1 = (a[1] & 0xff)
                var.append(Utility.toFloat(l0, h0, l1, h1, byte_order, v.type))
            # logger.debug("get floats data : %s" % var)
            logger.debug("Get floats data.")
            return var
        else:
            pass

        if length == 1:
            if v.type == 'bit':
                return 0 if r[0] == 0 else 1
            else:
                h = ((r[0] & 0xff00) >> 8)
                l = (r[0] & 0xff)
                if re.search(r'(^ba($|dc$))|(^dcba$)', byte_order) is not None:
                    if re.search('^unsigned', v.type):
                        t = ((l << 8) & 0xff00) | h
                        return t
                    else:
                        t = (((l & 0x7f) << 8) & 0xff00) | h
                        return t if (l & 0x80) == 0 else (t - 32768)
                else:
                    if re.search('^unsigned', v.type):
                        return r[0]
                    else:
                        return r[0] if (r[0] & 0x8000) == 0 else r[0] - 65536

        elif length == 2:
            if re.match("bits:", v.type, re.M | re.I):
                val = []
                for a in r:
                    val.append(0 if a == 0 else 1)
                return val
            else:
                h0 = ((r[0] & 0xff00) >> 8)
                l0 = (r[0] & 0xff)
                h1 = ((r[1] & 0xff00) >> 8)
                l1 = (r[1] & 0xff)
                if v.type == 'float':
                    return Utility.toFloat(
                        l0, h0, l1, h1, byte_order,
                        v.type)  # ieee754 converting,precision:default 2
                else:
                    return Utility.toDWord(l0, h0, l1, h1, byte_order, v.type)
        else:
            var = []
            for a in r:
                h = ((a & 0xff00) >> 8)
                l = (a & 0xff)
                if re.search(r'(^ab($|cd$))|(^cdab$)', byte_order) is not None:
                    var.append(h)
                    var.append(l)
                else:
                    var.append(l)
                    var.append(h)
            return var
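
The register handling above reduces to byte reordering; a small worked example of the unsigned single-register case for a 'ba'/'dcba' byte order (values made up for illustration):

r0 = 0x1234                      # register as read: high byte 0x12, low byte 0x34
h = (r0 & 0xff00) >> 8           # 0x12
l = r0 & 0xff                    # 0x34
swapped = ((l << 8) & 0xff00) | h
print(hex(swapped))              # 0x3412, the value after the 'ba' swap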
Exemple #24
0
                    data=predicted)

        else:
            print("Error: No model was trained in this instance....")

        return predictedProb[:, 1]


if __name__ == "__main__":

    trainset = "data.final/train1_cleaned_prune.csv"
    validationset = "data.final/validation_cleaned.csv"
    testset = "data.final/test.csv"

    print("Reading dataset...")
    timer = Utility()
    timer.startTimeTrack()

    trainReader = ipinyouReader.ipinyouReader(trainset)
    validationReader = ipinyouReader.ipinyouReader(validationset)
    testReader = ipinyouReader.ipinyouReader(testset)
    timer.checkpointTimeTrack()
    print("Getting encoded datasets")
    trainOneHotData, trainY = trainReader.getOneHotData()
    validationOneHotData, valY = validationReader.getOneHotData(
        train_cols=trainOneHotData.columns.get_values().tolist())
    testOneHotData, testY = testReader.getOneHotData(
        train_cols=trainOneHotData.columns.get_values().tolist())
    timer.checkpointTimeTrack()

    print("trainOneHotData:", trainOneHotData.shape, list(trainOneHotData))
Exemple #25
0
import logging
FORMAT = "[{%(levelname)s} %(filename)s:%(lineno)s 	- %(funcName)20s() ] %(message)s"
logging.basicConfig(filename='scratch.log', level=logging.DEBUG, format=FORMAT)
from Mining.Block import Block

import time
import User.User as User
import Utilities.Utility as Utility
WORK = 3
BLOCKCHAIN = []
BLOCKCHAIN.append(Utility.create_genesis_block())
while True:
    if len(BLOCKCHAIN) == 2500:
        break
    last_block = BLOCKCHAIN[len(BLOCKCHAIN) - 1]
    now = time.time()
    data = [{"from": "network", "to": User.public_key, "amount": 1.0}]
    done = False
    block = None
    while not done:
        effort, pow_hash_object = Utility.genhash(last_block.index + 1, now,
                                                  data, last_block.hash)
        leading_zeroes = Utility.leadingzeroes(pow_hash_object.digest())
        if leading_zeroes >= WORK:
            done = True
    block = Block(last_block.index + 1, now, pow_hash_object.hexdigest(),
                  effort, data, last_block.hash)
    BLOCKCHAIN.append(block)
Utility.validate_blockchain(BLOCKCHAIN)
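
The difficulty test in these mining loops hinges on Utility.leadingzeroes, which is not included in this snippet; a hypothetical equivalent that counts leading zero bits of a digest might look like this (the real Utility module may count zero bytes or hex digits instead):

def leading_zero_bits(digest: bytes) -> int:
    count = 0
    for byte in digest:
        if byte == 0:
            count += 8
            continue
        # count the zero bits at the top of the first non-zero byte
        for shift in range(7, -1, -1):
            if byte & (1 << shift):
                return count
            count += 1
    return count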
Exemple #26
0
    def preprocess(
        self,
        strings,
        verbose=False,
        removeNumericStrings=True,
        stringsRemovalThreshold=0.1,
        removeLinks=True,
        toLowerCase=True,
        stripTrailingPunctuation=True,
        punctuation='|&<>\“”"_=:!.,()?…\/{}][;:',
        removeEmptyEntries=True,
        removeShortEntries=True,
        minEntryLength=2,
        removeLongEntries=False,
        maxEntryLength=20,
        removeTheAAn=True,
        stemming=True,
        stripTrailingNumbers=True,
        removeNonAsciiWords=True,
    ):

        if verbose: print('preprocess: started with', len(strings), 'strings')

        if toLowerCase:
            strings = [s.lower() for s in strings]

        if removeNumericStrings:
            strings = [
                s for s in strings
                if Utility.getNumericContent(s) < stringsRemovalThreshold
            ]
            if verbose:
                print('preprocess: removeNumbers:', len(strings), 'strings')

        if removeLinks:
            strings = [
                s for s in strings if re.match(
                    r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', s) is None
            ]
            if verbose:
                print('preprocess: removeLinks:', len(strings), 'strings')

        if removeLongEntries:
            strings = [s for s in strings if len(s) <= maxEntryLength]
            if verbose:
                print('preprocess: removeLongEntries:', len(strings),
                      'strings')

        if removeTheAAn:
            theAAn = ['the', 'a', 'an']
            strings = [s for s in strings if s not in theAAn]
            if verbose:
                print('preprocess: removeTheAAn:', len(strings), 'strings')

        if removeNonAsciiWords:
            strings = [s for s in strings if Utility.isAscii(s)]
            if verbose:
                print('preprocess: removeNonAsciiWords:', len(strings),
                      'strings')

        if stripTrailingPunctuation:
            # str.strip(chars) removes every listed character from both ends
            for i, s in enumerate(strings):
                strings[i] = s.strip(punctuation)

        if stripTrailingNumbers:
            numbers = '1234567890'
            # strip digits first, then any punctuation they expose
            for i, s in enumerate(strings):
                strings[i] = s.strip(numbers).strip(punctuation)

        if removeEmptyEntries:
            strings = [s for s in strings if s]
            if verbose:
                print('preprocess: removeEmptyEntries:', len(strings),
                      'strings')

        if removeShortEntries:
            strings = [s for s in strings if len(s) >= minEntryLength]
            if verbose:
                print('preprocess: removeShortEntries:', len(strings),
                      'strings')

        if stemming:
            strings = [stem(s) for s in strings]

        return strings
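
A usage sketch for preprocess, assuming tp is an instance of the (unnamed) class it belongs to and the inputs are purely illustrative:

tokens = ['The', 'Kitchen', 'Cabinets!!', 'http://example.com', '42']
cleaned = tp.preprocess(tokens, verbose=True)
# With the defaults this lowercases everything, drops the link and the mostly
# numeric token '42', removes the article 'the', strips trailing punctuation
# from 'cabinets!!' and stems the survivors (e.g. 'cabinets' -> 'cabinet').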
Exemple #27
0

graph = {
        '1': ['2', '3', '4'],
        '2': ['5', '6'],
        '5': ['9', '10'],
        '4': ['7', '8'],
        '7': ['11', '12']
        }
startState = startStateHard

# startState = startStateEasy
# startState = startStateMedium
# startState = startStateHard

utility = Utility()
blankIndex = utility.getBlankIndex(startState)
stringRepOfStart = utility.getStringRep(startState)
stringRepOfGoal = utility.getStringRep(goalState)
startNode = BoardNode(startState, 'NULL', 'NULL', blankIndex, 1, stringRepOfStart)
goalNode = BoardNode(goalState, "NULL", "NULL", (1, 1), -1, stringRepOfGoal)

bfsSearch = BreadthFirst()
dfsSearch = DepthFirst()
dfsLimited = DepthLimited()
incDFSLimited = IncrementalDepthLimited()
bestFirstSearch = BestFirst()
astar = A_Star()
start_time = time.time()

astar.aStarSearch(startNode, goalNode, H1)
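
H1 is passed to aStarSearch above but is not defined in this snippet; for an 8-puzzle it is commonly the misplaced-tiles count. A hypothetical sketch, assuming the board state is a flat sequence with the blank marked as 0:

def misplacedTilesHeuristic(state, goalState):
    # Count tiles that are off their goal position, ignoring the blank
    return sum(1 for tile, goal in zip(state, goalState)
               if tile != 0 and tile != goal)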