Example #1
    def _learn_tree(self, data, attributes, depth):
        # Stop when no attributes remain or the depth limit is reached:
        # return a leaf predicting the most common label in the data.
        if not len(attributes) or depth + 1 == self.max_depth:
            leaf = Node()
            leaf.set_is_leaf(utils.most_common(data))
            return leaf

        # If every example carries the same label, return a pure leaf.
        if utils.all_the_same_label(data):
            leaf = Node()
            label = data.raw_data[0][0]
            leaf.set_is_leaf(label)
            return leaf

        # Split on the attribute with the highest information gain.
        base_entropy = utils.entropy(data)
        attribute_name, attribute_ig = utils.best_attribute(
            data, attributes, base_entropy)
        attribute = data.attributes[attribute_name]
        root = Node(attribute_name, attribute.possible_vals, attribute.index)
        depth += 1

        # Remove the chosen attribute once, so every child branch recurses
        # on the same reduced attribute set.
        child_attributes = utils.remove_attribute(attributes, attribute_name)

        for attribute_value in root.possible_vals:
            b = Branch(attribute_value)
            root.add_branch(b)
            data_sample = data.get_row_subset(attribute_name, attribute_value)

            if not len(data_sample):
                # No examples with this value: fall back to the majority label.
                leaf = Node()
                leaf.set_is_leaf(utils.most_common(data))
                b.set_child(leaf)
            else:
                b.set_child(self._learn_tree(data_sample, child_attributes, depth))

        return root
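Every one of these examples relies on a most_common helper from a local utils module that is not shown here. A minimal sketch, assuming it simply returns the most frequent element of a non-empty iterable of labels (in Example #1 it is applied to a dataset object, so the real helper presumably extracts the label column first), might look like:

from collections import Counter

def most_common(items):
    # Hypothetical sketch: return the most frequent element of a
    # non-empty iterable (ties resolved by first-counted order).
    return Counter(items).most_common(1)[0][0]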
Example #2
def measureAccuracy(X, Cindex, n, numberOfClasses):
    classTrue = [0] * n
    classPred = [0] * n

    # get true labels
    for j in range(n):
        classTrue[j] = X[j]['class']

    # pick a label for each class that gives the best results
    Cindex.sort(key=len)
    usedLabels = []
    for j in range(numberOfClasses):
        temp = []
        for i in range(len(Cindex[j])):
            if X[Cindex[j][i]]['class'] not in usedLabels:
                temp.append(X[Cindex[j][i]]['class'])

        # assign the most common still-unused label to every point in this cluster
        if Cindex[j]:
            label = most_common(temp)
            usedLabels.append(label)
            for i in range(len(Cindex[j])):
                classPred[Cindex[j][i]] = label

    # calculate precision and recall values
    result = ratio(classTrue, classPred)
    print("Precision:", result[0])
    print("Recall:", result[1])

    return classPred
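The ratio helper called above is also external to this snippet. Judging by the call site it returns a (precision, recall) pair; a minimal sketch under the assumption that both are macro-averaged over the label set (the averaging scheme is a guess, not the original implementation):

def ratio(y_true, y_pred):
    # Hypothetical helper: macro-averaged precision and recall.
    labels = set(y_true)
    precisions, recalls = [], []
    for label in labels:
        tp = sum(1 for t, p in zip(y_true, y_pred) if t == label and p == label)
        predicted = sum(1 for p in y_pred if p == label)
        actual = sum(1 for t in y_true if t == label)
        precisions.append(tp / predicted if predicted else 0.0)
        recalls.append(tp / actual if actual else 0.0)
    return sum(precisions) / len(labels), sum(recalls) / len(labels)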
Example #3
 def recover_derg(self, app_derg):
     """
     recover third party library nodes given an obfuscated derg
     :param app_derg:
     :return:
     """
     assert (isinstance(app_derg, DERG))
     for package in app_derg.get_packages():
         package_derg = app_derg.get_package_derg(package)
         matched_package, mapping = self.match_3lib_package(package_derg)
         if matched_package and mapping:
             matched_package_name = utils.most_common(
                 matched_package['packages'])
             print("matched third party package: %s" % matched_package_name)
             matched_derg = matched_package['derg']
             for matched_id, node_id in mapping.items():
                 node = app_derg.g.nodes[node_id]
                 node_type = node['type']
                 if node_type in STATIC_NODE_TYPES:
                     continue
                 node_type = node_type.split('_')[0]
                 if node_type in ['package', 'class', 'method', 'field']:
                     matched_node = matched_derg.g.nodes[matched_id]
                     node['type'] = node_type + '_3lib'
                     node['recovered_name'] = matched_node['name']
                     node['recovered_sig'] = matched_node['sig']
             print("recovered third party package: %s" %
                   matched_package_name)
     return app_derg
Example #4
    def fast_match_3lib_package(self, package_derg):
        for lib_package in self.lib_packages:
            lib_derg = lib_package['derg']
            lib_package_name = utils.most_common(lib_package['packages'])

            lib_hashes = lib_derg.get_node_hashes()
            package_hashes = package_derg.get_node_hashes()
            common_hashes = lib_hashes.intersection(package_hashes)

            # require a minimum overlap before considering this library
            common_count = len(common_hashes)
            if common_count < 3:
                continue

            precision = float(common_count) / len(package_hashes)
            recall = float(common_count) / len(lib_hashes)
            if precision > 0.9:
                return lib_package_name
        return None
Example #5
 def match_3lib_package(self, package_derg, isomorphism_timeout=10):
     for lib_package in self.lib_packages:
         lib_derg = lib_package['derg']
         lib_package_name = utils.most_common(lib_package['packages'])
         # quick filter: the library must contain every node hash of the package
         if not lib_derg.get_node_hashes().issuperset(
                 package_derg.get_node_hashes()):
             continue
         GM = isomorphism.DiGraphMatcher(
             lib_derg.g,
             package_derg.g,
             node_match=ThirdPartyLibRepo.node_match,
             edge_match=ThirdPartyLibRepo.edge_match)
         try:
             # subgraph isomorphism is expensive, so bound it with a timeout
             with utils.timeout(isomorphism_timeout):
                 if GM.subgraph_is_isomorphic():
                     return lib_package, GM.mapping
         except Exception:
             print("graph isomorphism timeout during matching %s" %
                   lib_package_name)
     return None, None
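utils.timeout is used here as a context manager that aborts the isomorphism check after a given number of seconds. A minimal, Unix-only sketch based on SIGALRM (one common way to implement it; the project's actual version may differ):

import signal
from contextlib import contextmanager

@contextmanager
def timeout(seconds):
    # Hypothetical implementation: raise TimeoutError if the wrapped
    # block runs longer than `seconds` (main thread, Unix only).
    def _handler(signum, frame):
        raise TimeoutError("operation timed out")
    previous = signal.signal(signal.SIGALRM, _handler)
    signal.alarm(int(seconds))
    try:
        yield
    finally:
        signal.alarm(0)
        signal.signal(signal.SIGALRM, previous)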
Example #6
    def run_inference_on_image(self):
        """
        Run inference on images
        * Creates the graph
        * Loads the image
        * Performs inference
        """
        self.detecting = True
        answer = None

        ####
        # STREAM PREDICTIONS ON IMAGES
        ####

        # Check if image directory exists
        if not tf.gfile.Exists(self.image_path):
            tf.logging.fatal('File does not exist %s', self.image_path)
            return answer

        # Get images list
        file_list = glob.glob(os.path.join(self.image_path, '*.jpeg'))

        if len(file_list) > 0:
            answers = []
            scores = []
            for im_path in file_list:
                # Load image
                image_data = tf.gfile.FastGFile(im_path, 'rb').read()

                # Run inference
                predictions = self.sess.run(self.softmax_tensor, {
                    'DecodeJpeg/contents:0': image_data,
                    'Placeholder_1:0': 1.0
                })
                predictions = np.squeeze(predictions)

                # Getting top prediction
                top_1 = predictions.argsort()[-1:][0]

                answers.append(self.labels[top_1])
                scores.append(predictions[top_1])

            # Get most common label
            most_common_label = utils.most_common(answers)

            # Get average probability of most common label
            proba_most_common = utils.proba_most_common(
                most_common_label, answers, scores)

            self.detecting = False

            # Send to Rabbit MQ
            if proba_most_common > 0.8:
                self.imageDetected = True
                if not self.enqueueClear:
                    print('image detected (label image)')
                    self.sender.obstacleDetection(
                        '{"type": "OBSTACLE","payload": { "obstacle": "' +
                        self.should_stop_train(most_common_label.upper()) +
                        '", "obstacleType": "' + most_common_label.upper() +
                        '"}}')
                else:
                    self.clear_detection()
            else:
                self.send_clear()

            self._clear_folder(self.image_path)

            print "Most common label: %s" % most_common_label
            print "Probability: %s" % proba_most_common

            time.sleep(2)

        else:
            print "No data"
            time.sleep(5)
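utils.proba_most_common is only called, never defined, in this example. From the surrounding comments it averages the scores of the frames whose top prediction equals the most common label; a minimal sketch under that assumption:

def proba_most_common(label, answers, scores):
    # Hypothetical helper: mean score of the predictions that match `label`.
    matching = [s for a, s in zip(answers, scores) if a == label]
    return sum(matching) / len(matching) if matching else 0.0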
Example #7
 
 
 notcompletebetters = []
 for m in matchdata:
     for b in betkeys:
         if b not in m.keys() and b not in notcompletebetters:
             notcompletebetters.append(b)    
 
 
 betkeys = [k for k in betkeys if k not in notcompletebetters]
 betpreds = []
 for m in matchdata:
     tmp = []
     for b in betkeys:
         tmp.append(m[b])
     # bookmaker consensus: the most common prediction among all bookmakers
     majority = most_common(tmp)
     tmp.append(majority)
     m['BetResult'] = majority
     tmp.append(m['FTR'])
     betpreds.append(tmp)
 data['matchdata'] = matchdata
 
 
 betkeys.append('MajorityBet')
 betkeys.append('Result')
     
 bfile = open("betters.csv", "wb")#Stores the better predictions for each match
 writer = csv.writer(bfile, quoting=csv.QUOTE_ALL)
 writer.writerow(betkeys)
 for tr in betpreds:
     writer.writerow(tr)
 bfile.close()
def heuristic_1(fae, sae, och):
    global output_file_1, output_file_2

    # work on copies so the caller's mappings are not modified
    funding_address_entity = dict(fae)
    settlement_address_entity = dict(sae)
    r = dict()
    r['n_funding_entities'] = len(set(funding_address_entity.values()))
    r['n_settlement_entities'] = len(set(settlement_address_entity.values()))
    r['n_entities'] = len(
        set(settlement_address_entity.values()).union(
            set(funding_address_entity.values())))
    r['n_addresses'] = len(
        set(settlement_address_entity.keys()).union(
            set(funding_address_entity.keys())))
    r['n_nodes'] = len(node_channels)

    funding_address_entity, settlement_address_entity = \
        set_mapping(funding_address_entity, settlement_address_entity, och)

    # print('Start heuristic 1...')
    blockstream_funding_txs = read_json(input_file1)

    # print('use_entities', use_entities)

    # mapping between stx and its ftx
    stx_its_ftx = dict()
    for channel in channels.values:
        funding_tx, out_index = channel[0].split(':')
        funded_address = \
            funding_txs[funding_tx]['outputs'][int(out_index)]['address']
        settlement_txs = funded_address_settlement_txs[funded_address]
        if len(settlement_txs) == 1:  # it is always zero or one tx
            stx = settlement_txs[0]['tx_hash']
            if stx not in stx_its_ftx:
                stx_its_ftx[stx] = funding_tx
            else:
                print('stx already in dict', stx)

    # create links for heuristic 1 (both at address and entity level)
    stx_a_ftx = []  # list of settlement tx, address, funding tx
    for uftx in blockstream_funding_txs.values():
        for i in uftx['vin']:
            a = i['prevout']['scriptpubkey_address']
            prev_tx = i['txid']
            if a in settlement_addresses:
                if prev_tx in settlement_txs_hashes:
                    stx_a_ftx.append([prev_tx, a, uftx['txid']])
    #             else:
    #                 # a is a settlement_address but prev_tx is not a
    #                 # settlement_tx in our data

    stx_e_ftx = []  # list of settlement tx, entity, funding tx
    for uftx in blockstream_funding_txs.values():
        for i in uftx['vin']:
            e = funding_address_entity[i['prevout']['scriptpubkey_address']]
            prev_tx = i['txid']
            if e in settlement_address_entity.values():
                if prev_tx in settlement_txs_hashes:
                    stx_e_ftx.append([prev_tx, e, uftx['txid']])

    tx_2in1 = '88679369ec778d5187c207676c788e7d22272e64c120e0cd6e06858864bdb5e9:1'
    # I need a mapping between funding tx and nodes
    # (ignore case with two funding txs in one tx cause one channel is still open)
    # and between settlement tx and nodes
    ftx_nodes = dict()
    for channel in channels.values:
        if channel[0] != tx_2in1:
            ftx = channel[0].split(':')[0]
            ftx_nodes[ftx] = [channel[1], channel[2]]

    funded_address_channel = dict()
    for channel in channels.chan_point.values:
        hsh, out_index = channel.split(':')
        funded_address = funding_txs[hsh]['outputs'][int(out_index)]['address']
        if funded_address not in funded_address_channel:
            funded_address_channel[funded_address] = channel
        else:
            print(funded_address, ' has multiple channels')

    stx_nodes = dict()
    for fa, channel in funded_address_channel.items():
        if channel != tx_2in1:
            stxs = funded_address_settlement_txs[fa]
            ftx = channel.split(':')[0]
            if stxs:
                stx = stxs[0]['tx_hash']
                stx_nodes[stx] = ftx_nodes[ftx]

    # print('Initial number of links addresses', len(stx_a_ftx))
    # print('Initial number of links entities', len(stx_e_ftx))

    # decide link level: address or entity
    triplet = stx_a_ftx
    if use_entities:
        triplet = stx_e_ftx

    links = []  # like stx_a_ftx plus 4 nodes of channels
    for el in triplet:
        # the funding entity controls the node in common between the channel
        # opened with ftx and closed with stx
        stx, a, ftx = el
        n1, n2 = ftx_nodes[ftx]  # happens after the stx
        n3, n4 = stx_nodes[stx]
        links.append([stx, a, ftx, n1, n2, n3, n4])

    useful_links = []
    for link in links:
        s = set(link[3:])
        if len(s) == 3:
            useful_links.append(link)

    # if closing of other node in ch1 > opening of other node in ch2
    # then we can use the link
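    # Note: most_common(link[3:]) picks the node that both channels share,
    # i.e. the one endpoint among the four node ids that repeats.
    # For example (hypothetical ids): most_common(['A', 'B', 'B', 'C']) == 'B'.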
    usable_links = []
    for link in useful_links:
        node_in_common = most_common(link[3:])
        other_node_ch1 = ''
        other_node_ch2 = ''
        for node in link[3:][::-1]:
            if node != node_in_common:
                if not other_node_ch1:
                    other_node_ch1 = node
                else:
                    other_node_ch2 = node
        if node_openings_closings[other_node_ch1]['last_activity'] > \
                node_openings_closings[other_node_ch2]['first_activity']:
            usable_links.append(link)

    reliable_links_addresses = []
    for link in usable_links:
        link_address = link[1]
        stx = link[0]
        its_ftx = stx_its_ftx[stx]
        if link_address in [
                el['address'] for el in funding_txs[its_ftx]['inputs']
        ]:
            reliable_links_addresses.append(link)
    # print('Number of reliable links at address level:',
    #       len(reliable_links_addresses))

    reliable_links_entities = []
    entities_reusing = set()
    for link in usable_links:
        if use_entities:
            link_entity = link[1]
        else:
            link_entity = settlement_address_entity[link[1]]
        stx = link[0]
        its_ftx = stx_its_ftx[stx]
        if link_entity in [
                funding_address_entity[el['address']]
                for el in funding_txs[its_ftx]['inputs']
        ]:
            entities_reusing.add(link_entity)
            reliable_links_entities.append(link)

    # print('Number of reliable links at entity level:',
    #       len(reliable_links_entities))
    # print('Number of entities reusing funding addresses:', len(entities_reusing))

    # step 1: linking nodes to entity using stx and ftx
    # print('Step 1:')
    heuristic_1a_entity_node = dict()
    heuristic_1a_node_entity = dict()
    for link in reliable_links_entities:
        if use_entities:
            e = link[1]
        else:
            e = settlement_address_entity[link[1]]
        n = most_common(link[3:])
        if e not in heuristic_1a_entity_node:
            heuristic_1a_entity_node[e] = set()
        heuristic_1a_entity_node[e].add(n)
        if n not in heuristic_1a_node_entity:
            heuristic_1a_node_entity[n] = set()
        heuristic_1a_node_entity[n].add(e)
    # print('Number of entities linked to nodes:', len(heuristic_1a_entity_node))
    # print('Number of nodes linked to entities:', len(heuristic_1a_node_entity))

    # print('Step 2:')
    # link other node and entity in channel
    heuristic_1b_entity_node = link_other_nodes(heuristic_1a_entity_node,
                                                channels,
                                                funded_address_settlement_txs,
                                                funding_txs,
                                                settlement_address_entity)
    heuristic_1b_node_entity = invert_mapping(heuristic_1b_entity_node)

    # correct means that the settlement tx has exactly two output entities
    correct_stxs = []  # correct stxs
    correct_settlement_entities = set()  # output entities of correct stxs
    correct_nodes = set()
    for channel in channels.values:
        funding_tx, out_index = channel[0].split(':')
        node_1 = channel[1]
        node_2 = channel[2]
        funded_address = \
            funding_txs[funding_tx]['outputs'][int(out_index)]['address']

        settlement_txs = funded_address_settlement_txs[funded_address]
        # if channel is closed and number of outputs == 2 and
        # one node is mapped to one entity in the outputs
        if settlement_txs:  # it is always only one
            for settlement_tx in settlement_txs:
                # count entities
                entities = set([
                    settlement_address_entity[out['address']]
                    for out in settlement_tx['outputs']
                ])
                if len(entities) == 2:
                    correct_stxs.append(settlement_tx)
                    correct_settlement_entities = correct_settlement_entities.union(
                        entities)
                    correct_nodes.add(node_1)
                    correct_nodes.add(node_2)

    perc_entities_linked_settled = round(
        100 * len(heuristic_1b_entity_node) / r['n_settlement_entities'], 2)
    perc_entities_linked_2e = round(
        100 * len(heuristic_1b_entity_node) / len(correct_settlement_entities),
        2)
    perc_nodes_linked_2e = round(
        100 * len(heuristic_1b_node_entity) / len(correct_nodes), 2)

    r = get_results(r, heuristic_1b_entity_node, heuristic_1b_node_entity)

    print('Number of settlement entities:', r['n_settlement_entities'], '--',
          perc_entities_linked_settled, '% linked')
    print(
        'Number of settlement entities considering settlement txs with 2 output entities:',
        len(correct_settlement_entities), '--', perc_entities_linked_2e,
        '% linked')
    print('Number of nodes considering settlement txs with 2 output entities:',
          len(correct_nodes), '--', perc_nodes_linked_2e, '% linked')

    addresses_linked = set()
    for address_entity in [funding_address_entity, settlement_address_entity]:
        for address, entity in address_entity.items():
            if entity in heuristic_1b_entity_node:
                addresses_linked.add(address)
    r['perc_addresses_linked'] = round(
        100 * len(addresses_linked) / r['n_addresses'], 2)

    output_file_a = output_file_1
    output_file_b = output_file_2
    if och['stars']:
        output_file_a = output_file_4
        output_file_b = output_file_5
    if och['none']:
        output_file_a = output_file_6
        output_file_b = output_file_7
    if och['snakes']:
        output_file_a = output_file_10
        output_file_b = output_file_11
    # if och['collectors']:
    #     output_file_a = output_file_8
    #     output_file_b = output_file_9
    if och['proxies']:
        output_file_a = output_file_8
        output_file_b = output_file_9
    if och['all']:
        output_file_a = output_file_1
        output_file_b = output_file_2

    # Write to file
    heuristic_1_entity_node = {
        str(k): [e for e in v]
        for k, v in heuristic_1b_entity_node.items()
    }
    heuristic_1_node_entity = {
        k: [int(e) for e in v]
        for k, v in heuristic_1b_node_entity.items()
    }
    print(och)
    print('writing to', output_file_a, output_file_b)
    write_json(heuristic_1_entity_node, output_file_a)
    write_json(heuristic_1_node_entity, output_file_b)

    return r
from collections import Counter  # needed for the repeat counting below

def main():
    sequences, total_records = utils.readGenome('dna4.fasta')

    # question 1: total records in the file
    print(total_records)  # answer: 22

    # question: longest sequence in the file?
    lengths = [len(i) for i in sequences]
    print(max(lengths)) # answer: 4815
    print(min(lengths)) # answer 40


    # question: what is the length of the longest ORF appearing in reading frame 1 of any of the sequences?
    max_orf = 0
    for i in sequences:
        cur_seq = utils.find_orf(i, 0)
        for j in cur_seq:
            if len(j) > max_orf:
                max_orf = len(j)
    print(max_orf)  # answer 1767

    # question: what is the length of the longest ORF appearing in any sequence and in any forward reading frame?
    max_orf = 0
    for i in sequences:
        cur_seq = utils.find_orf(i, 1)
        for j in cur_seq:
            if len(j) > max_orf:
                max_orf = len(j)
    print(max_orf)  # answer 1770

    # question: what is the length of the longest forward ORF that appears in the sequence with the identifier gi|142022655|gb|EQ086233.1|349?
    test_orf = "GATCGCCGCCTGGGTTGTCGAGACACCTGCGCGTGCGCGTCGAACGAAACACCTTGACCCACCGTATGCC CGGCACCGCGCGCGTCCCGGCCGACCTCGCGACACCGAGCGGCACCGCTTCGAAGCATTCTAGCCGGCTC GCGCTTCGCGAACCACCTTTTCGGACGAAAATCCGCACGTTGAATCACTTTCCTGCTTCGTATTTCACGC AAACTGCGTACAATCCTGAGACAACAGTACGTCAACTTCAGGAGAGCAACGATGCCCCCTCGCAAGGATC GCGATACGCCCCATCGCTATCGCAGCGGCGAGGCCGCGCGCCTGGCGCGCATGCCGGCAGCCACGCTGAG AATCTGGGAACGGCGCTATGGCGTGGTTGCGCCGCCCAAAACGCCGTCCGGACAACGGCTGTACTCGGAC GACGACGTGCAGCGCATTCGATTGCTGAAAACGCTCGTCAATCAGGGCCACGCGATCGGGTCGATCGCCA GCCTGAGCCGCGAGGAACTCGAGGCGTTGTCGTTGACGAATACGCGTGACCCGGCGTTTCACGAGGCAAG TGTGAGCCTCGCGGTCGTCGGCGCGCTTTCGATTCCGGAAGCCGCGATCGAGCGAATGGGAATCCGGATC GCCGCGCGAATCGACTCGCTCGACGACACGAGCGCGCATGCGGGTACGTCGGTCGATGCCCTCATCGCGA CGACCACGTCGCTCCATGAGGATGTCGTTTCGCAGCTCGCTGCCCAGGCGCAACAGCTCAACGCGCACGC CGTGGCCGTCGTATACGGGTTCGGCACGGCAGAAGCGGTCGAGCTGGCGCGTCTGTCGGGGTTCGAGCTG TTCCGGTCGACGGAAGGCCAGACCAACCCGATATCGATCATTTCGAAACTGGCGCAAGCCGTCGTCAAGT CGCGCCAATCGAATGACGCGGATCGCGGGCTCTGGCTGCGCACGCGGCGACGCTTCGACGAGGCGACGCT CGCGTCGCTCAGCGGCCTGTCCACCACCGTCAAATGCGAGTGTCCGCGTCACCTCTCCGAATTGATCATG CAGCTCAGTGCGTTCGAGCGATACAGCGACGAATGCGTGTCGCGATCGCCGGCCGATGCGCTGCTGCACC GCCACCTTGGAGACGCAGCGAACCGGGCAGCCGAATTGCTCGAGACGGCGCTTGCCGTCATTCTCCGCGA AGAGGGATTGGGCGGGACGACGCCGGAACTGAAGGCGCTGTAGCGCGGCACGCGCCGCCGGCTGTTCCGA CCTGCCGACGACGGCAGGTGGCGATGCTCTTTCGCGTGCAATGCAGGGCTTGCGTCGATCACTGAGCCGA AACGGAAGAACGAGCCGCTGCGGCAGGCGATGCCGGCGGCCTGCCCGTGGTTCCGGCATTCGACGCATGC GCGACTCGATCCACGAACGCGGAGAGATCGTCGAGCACTGACGCCATCCCTCTCAACGGCGCGCCCAGAA CACCGACGATCGACGCATGGCCCACGTTCCGGTAACGCATGACCACGACGGTGTCGCCCTTCTCTTGCAA TGCTCGCGCGAAGCGGGTCGTGTTGCCGGGCTCGACCACGGTGTCGTTCTCCGCGGTGGCCAGCCACATC GGCGGCTCCGTACCCTGAATGAACCGGATGGGCTGGCTCGCGGCCCGCACTTCCTGCGGGAATATCCTTT CAAGCGTGGTATCGCGCAGCGGCAGGAAATCATAAGCCCCGGCCAGGCCAATCACGCCGGCGATATCGCT CTTCCGCATCGCCTGTGCCGCCAGATAGCGGCCGTCGGTCGCAAGCAATGCGGCAATCTGCGCGCCCGCG GAATGCCCCATCAGAAACAGGCGATGTGGATCGCCGCCGAACGCAACCGCGTGCTCGCGTGCCCACGCGA CCGCCTGCGCCGCATCGTCGACGAAACCGGGAAAGGTGGTCGCCGGATACGTCCTGTAGTCGGGTAAGAC GGCAACAAAGCCCCTCGACGCGAGCGCCTCTCCCACGAACAGATAGTCCTTGCGCTCGCCGGACTGCCAG CTTCCGCCGTAAAGGAACACGACCACAGGGGCGCCCGCACTCGCATCGGCCGGCCAGTGATGCAAGACGC GCGTGGGCAAATAGACGTCGAGCACCTGGCGTTCGCCGGATCCGTACGGGATACCTGCGAACAGACTGAA CGTGTAGCTCGGCGTCAGCGCATTCAGGAGCCGCACCGGGCTGCACGCGGAGAGGAGACCGGCCGCGAGC AGCACCGACAGGACGACAAGCCCGGCTTTCATGTTCATGGAGATCCCCATTCCTGACGATTCCGGCCGCA TCCGCCGCCTGGTACGAGGTTTACGGCGCTTGCGCGCAAGCGGATGCACGCATCGCATGGCAACCGCGCC CCTTGACGGCATCCAGATCTTTCCTGCGCAAGTGCATCCGTCCGCAACGGAGAGTCGTATGTGAATGGAT AGGTGAATCAACGCGGAATGCCGACCATCGCTCGCTGCAAAGCAATCGTCCGGTGGCGAGTCCGCTCGTC GACGATAGTGAGAGCCGTCTGCCATGAGCGTTCTACCTGCCACTTACCCCGAGATGCAACGTCGACGTGG CGGCACGGCGACCATGCCGTTACCGATGATCCCGCGCGAACGATCATGAGGAGCGCGCCGAATCAACTGA CGTCGAGCACGCAAAAGTCCGGCGCCGCTCGCGTGTACGTCTATCTCGCGACGACCCAGACAGGATGGCT GGTATGCGTGATGACTGCCGCAGCGCATCACGCCGCGTGGGGCGTCACCTATGCGCTGATCGCGACAGCG GGCCATCTTCTCTTCGCGCGTCGGCCCGCATCCGAGGCGCGGATCGTCATCACGGTCACGGTGTCCGGAT GGTTATGGGACAGCGCCGTTGCACATTCCGGCCTGCTCGTGTACCCGAACGGCGTTTTTCTCAAAGGTAC AGCGCCGTACTGGCTCGCGGGGCTGTGGGCGCTGTTCGCGATTCAACTCAACACCTTGCTGCTCTGGCTT CGGGCGCGACCGCTCGTCTCGGCGCTCGTCGGCGCATTCGCAGGCCCCGCATCCTTTCGCGCAGGTGCGG CGCTGGGGGCCGTTCATTTCAAAGACTCGGCTGCAGCGCTCGTCGTTCTCGCAACCGGCTGGGCGTTCAT CTTGCCGGCCGCGCTTGCGATTGCAAGCCATTGGGATGGCGTAACGCCCCCTTCTCCTCCGCCAATCGGC GCAGGCGACATGAATGACGCCCGCGCCGGATAGAGCCGGACGCGTCGTAAGCCAGCGTTATCTCCGATCC CGTTCAAATTGCCAACGTACCTTCTCAGGCACCACACATGACACGCACCGAATTGCCGTATGAATCCCGC CCCGTTATCGTATGGTTTCGGGATGACCAACGACTCAGCGACAATCCCGCACTCTCTCATGCGGTCAGTA 
CCGGCCATCCTGTTGTTTGCGTCTACGTCTACGACCCTGCCCCGAAGCTCGGGCGCGCCATGGGGGGCGC GCAGAAGTGGTGGCTGCACGAGTCGTTGAAAAAACTCGACGACTCGCTTTCCGCTCTCGGCGGCTCGCTG CTCGTGCTTCGCGGTAACGAACACGAAGCCATCAGGAGCCTCGCCGTCGAGACCCGGGCGGCAATGGTTT TCTGGAATCGCCGCTACTCGAAAGCGCAAACGGAAATGGATGCATCGATCAAGAAAGACCTGATCGGGCG CGGCATCGACGTGTCGACATTCAATGGCCATCTTTTGCGCGAACCCTGGACAGTGGCCACGCGCGAAGGC TTGCCGTTCCAGGTATTCAGCGCGTACTGGAGAGCCGCTCGCCGCGATAATTTTTTCCCGCCGTGCCCAC TGTCGGCGCCCGCCCGGGTCACGTTCTTTCCCGTCTCCAGAAACGTCAGCGCACACGTCTGTACGCTTCC CGCGCTTGCACTGCAGCCCTCGACGCCGGACTGGGCGGAGGGCCTGCGTGCAACCTGGCGATGCGGCGAG GAAGCGGCCGGGCATCAACTCGAGGCCTTCATTGAACACTCGTTTTCCGACTATGCCGGCGCTCGAGATT TTCCGGCCACTCGAGCGACGAGCCGGCTCTCTCCGTATCTTCGCTTCGGAAATATCTCGGCCCGGCAGGT GTGGTACGCGACGTTATCAGCGGTAGACGCGATGCGAAGCAGGCGAGTTGTTCGCATTGACGATGCCAAA AATGAGTCGTTGAACAAGTTCTTCAGTGAACTCGGATGGAGAGAATTCTCGTATTACCTTCTTTACCACT GCGAACCCCTTCATCAGGTCAATTTCCGGCGTCAGTTTGACGCCATGCCGTGGCGTACCGACGCCAAGGC GCTTCGCGCGTGGCAAAGGGGGAAAACAGGATACCCGCTGGTCGACGCCGGCATGCGCGAGCTTTGGCAC ACGGGCTGGATGCACAACCGCGTGCGCATGGTGACAGCGTCATTTCTCACCAAGCACTTGCTGATCGACT GGCGCGAGGGCGAAGCATGGTTCTGGGATACGCTGGTTGACGCG"
    max_orf = 0
    cur_seq = utils.find_orf(test_orf, 0)
    for j in cur_seq:
        if len(j) > max_orf:
            max_orf = len(j)
    cur_seq = utils.find_orf(test_orf, 1)
    for j in cur_seq:
        if len(j) > max_orf:
            max_orf = len(j)
    cur_seq = utils.find_orf(test_orf, 2)
    for j in cur_seq:
        if len(j) > max_orf:
            max_orf = len(j)
    print(max_orf)  # 972 or 975

    # q: find the most frequently occurring repeat of length 6 in all sequences. How many times does it occur in all?
    all_repeats = []
    for i in sequences:
        repeats_list = utils.get_all_repeats(i,6)
        for j in repeats_list:
            all_repeats.append(j)
    print(all_repeats.count(utils.most_common(all_repeats))) # answer 208

    # q: find all repeats of length 11 in the input file. Let's use Max to specify the number of copies of the most frequent repeat of length 11. How many different 11-base sequences occur Max times?

    all_repeats = []
    for i in sequences:
        repeats_list = utils.get_all_repeats(i,11)
        for j in repeats_list:
            all_repeats.append(j)
    print(Counter(all_repeats).most_common(10))  # answer: 5
    seq1=0
    seq2=0
    seq3=0
    seq4=0
    for i in sequences:
        repeats_list = utils.get_all_repeats(i,7)
        for j in repeats_list:
            if j == 'CGGCGCG':
                seq1 +=1
            if j == 'CGGCACG':
                seq2 +=1
            if j == 'GCGGCAC':
                seq3 +=1
            if j == 'TCGGCGG':
                seq4 +=1
    print(str(seq1) + " | " + str(seq2) + " | " + str(seq3) + " | " + str(seq4) + " | ")
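utils.get_all_repeats is not shown either. A minimal sketch, assuming it returns one entry per occurrence of every length-n substring that appears more than once in the sequence (consistent with how the counts are used above, but still an assumption):

def get_all_repeats(sequence, n):
    # Hypothetical helper: every occurrence of each repeated length-n substring.
    counts = {}
    for i in range(len(sequence) - n + 1):
        counts[sequence[i:i + n]] = counts.get(sequence[i:i + n], 0) + 1
    return [sequence[i:i + n]
            for i in range(len(sequence) - n + 1)
            if counts[sequence[i:i + n]] > 1]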