def anomaly_selection(files_map_file,
                      anomalies_output_file,
                      use_dbscan,
                      differences_file=None,
                      differences=None):
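    """Select anomalous samples and write their file paths as JSON.

    Differences are loaded from differences_file when it is given (binary
    for the DBSCAN path, ASCII/JSON for the 3-sigma path); otherwise the
    differences argument is used directly. Returns the number of anomalies.
    """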
    if differences_file:
        if use_dbscan:
            differences = binary_read(differences_file)
        else:
            differences = ascii_read(differences_file)

    if use_dbscan:
        anomalies = dbscan_anomaly_selection(differences)
    else:
        anomalies = three_sigma_anomaly_selection(differences)

    anomalies_write_time_logger = TimeLogger()
    with open(files_map_file) as files_map_file_descriptor:
        files_map = json.loads(files_map_file_descriptor.read())
        anomaly_files = []
        if use_dbscan:
            for anomaly_index in anomalies:
                anomaly_files.append(files_map[anomaly_index])
        else:
            for anomaly_index, anomaly_value in anomalies:
                anomaly_files.append((files_map[anomaly_index], anomaly_value))

        with open(anomalies_output_file,
                  'w') as anomalies_output_file_descriptor:
            anomalies_output_file_descriptor.write(json.dumps(anomaly_files))

    print('Anomaly list written. Time: ' +
          str(anomalies_write_time_logger.finish()))

    return len(anomaly_files)
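# Hypothetical invocation (file names are illustrative, not from this project):
#   anomalies_number = anomaly_selection('files_map.json',
#                                        'anomalies.json',
#                                        use_dbscan=False,
#                                        differences_file='differences.json')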
    def write_file(self):
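        """Decode the fetched file content and save it as a .kt file.

        On RateLimitExceededException the file number is logged and the
        call is retried after a one-minute pause; on UnknownObjectException
        the file is logged and skipped.
        """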
        time_logger = TimeLogger()

        try:
            content = self.obj.decoded_content.decode('utf-8')
        except Exception as e:
            pprint(e)
            if isinstance(e, RateLimitExceededException):
                print('Rate limit exceeded. Retrying in 1 minute.')
                with open('rate_limit_exceeded_exceptions.log',
                          'a') as exceptions_descriptor:
                    exceptions_descriptor.write(str(self.number) + os.linesep)
                time.sleep(60)
                return self.write_file()
            elif isinstance(e, UnknownObjectException):
                print('File is skipped because it was not found.')
                with open('unknown_object_exceptions.log',
                          'a') as exceptions_descriptor:
                    exceptions_descriptor.write(str(self.number) + os.linesep)
            return None

        path = ContentSaver.save(self.directory,
                                 self.number,
                                 content,
                                 ext='kt')

        time_logger.finish(task_name='Write ' + path + ' (#' +
                           str(self.number) + ') file')
def dbscan_anomaly_selection(differences):
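    """Select anomalies as DBSCAN noise points.

    Samples that DBSCAN (eps=3, min_samples=5, Euclidean metric) labels
    with -1 belong to no cluster; their indexes are returned as anomalies.
    """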
    dbscan_time_logger = TimeLogger()
    labels = DBSCAN(eps=3, min_samples=5,
                    metric='euclidean').fit_predict(differences)
    anomaly_indexes = [i for i, x in enumerate(labels) if x == -1]
    print('DBSCAN finished its work. Time: ' +
          str(dbscan_time_logger.finish()))

    return anomaly_indexes
def ascii_read(differences_file):
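    """Read a JSON list of (index, difference) pairs from differences_file.

    Returns two parallel lists: sample indexes and difference values.
    """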
    differences_read_logger = TimeLogger()
    with open(differences_file) as f:
        differences = json.loads(f.read())

        difference_indexes = []
        difference_values = []
        for difference in differences:
            difference_indexes.append(difference[0])
            difference_values.append(difference[1])

    print('Differences read finished. Time: ' +
          str(differences_read_logger.finish()))

    return difference_indexes, difference_values
def three_sigma_anomaly_selection(differences):
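    """Select (index, value) pairs outside mean +/- 3 standard deviations."""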
    three_sigma_time_logger = TimeLogger()

    difference_indexes, difference_values = differences
    mean = np.mean(difference_values)
    std_deviation = np.std(difference_values)
    left_bound_3_sigma = mean - 3 * std_deviation
    right_bound_3_sigma = mean + 3 * std_deviation

    anomalies = []
    for i, x in enumerate(difference_values):
        if x < left_bound_3_sigma or x > right_bound_3_sigma:
            anomalies.append((difference_indexes[i], difference_values[i]))

    print('3-sigma anomaly selection finished its work. Time: ' +
          str(three_sigma_time_logger.finish()))

    return anomalies
def binary_write(differences, features_number, output_file):
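    """Pack differences as native 32-bit floats and append to output_file.

    The matrix is flattened column-major and features_number is appended
    as a trailing float so that binary_read can restore the original
    shape. The packed bytes are written in roughly 10 MB chunks.
    """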
    chunking_time_logger = TimeLogger()
    differences = differences.flatten('F')
    differences = np.append(differences, features_number)
    differences = struct.pack('=%df' % differences.size, *differences)

    chunk_size = 10000000
    difference_chunks = funcy.chunks(chunk_size, differences)
    print('Chunking finished. Time: ' + str(chunking_time_logger.finish()))

    chunk_counter = 1
    with open(output_file, 'ab') as f:
        for difference_chunk in difference_chunks:
            difference_chunk_time_logger = TimeLogger()
            f.write(difference_chunk)
            print('Difference chunk ' + str(chunk_counter) +
                  ' written. Time: ' +
                  str(difference_chunk_time_logger.finish()))
            chunk_counter += 1
parser.add_argument('--files_map_file', nargs=1, type=str,
                    help='path to the file that maps dataset indexes to AST file paths')
parser.add_argument('--anomalies_output_file', '-o', nargs=1, type=str,
                    help='path to the output file that will contain the anomaly list (as paths to AST code snippets)')

args = parser.parse_args()
stage = args.stage

if stage == 'autoencoding':
    dataset_file = args.dataset[0]
    split_percent = args.split_percent[0]
    encoding_dim_percent = args.encoding_dim_percent[0]
    output_file = args.differences_output_file[0]
    use_dbscan = args.use_dbscan

    total_time_logger = TimeLogger()

    autoencoding(dataset_file, split_percent, encoding_dim_percent, output_file, full_differences=use_dbscan)

    print('==============================')
    print('Autoencoder finished its work. Time: ' + str(total_time_logger.finish()))

elif stage == 'anomaly_selection':
    differences_file = args.differences_file[0]
    files_map_file = args.files_map_file[0]
    anomalies_output_file = args.anomalies_output_file[0]
    use_dbscan = args.use_dbscan

    total_time_logger = TimeLogger()

    anomalies_number = \
        anomaly_selection(files_map_file,
                          anomalies_output_file,
                          use_dbscan,
                          differences_file=differences_file)

    print('==============================')
    print('Anomaly selection finished its work. Anomalies found: ' +
          str(anomalies_number) + '. Time: ' + str(total_time_logger.finish()))
def autoencoding(dataset_file, split_percent, encoding_dim_percent, output_file=None, full_differences=None):
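    """Train the autoencoder and compute reconstruction differences.

    With full_differences the per-feature difference matrix is written in
    binary form (the DBSCAN path); otherwise per-sample differences are
    sorted in descending order and written as ASCII/JSON. The differences
    are returned instead when no output_file is given.
    """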
    time_logger = TimeLogger()
    data = DatasetLoader(dataset_file).load(split_percent=split_percent)
    (_, _, features_number) = data
    encoding_dim = math.ceil(features_number * encoding_dim_percent)
    print('Dataset loaded. Time: ' + str(time_logger.finish()))

    time_logger = TimeLogger()
    autoencoder = Autoencoder(features_number, encoding_dim, data)
    autoencoder.print_model_summary()
    autoencoder.fit()
    print('Autoencoder fit finished. Time: ' + str(time_logger.finish()))

    time_logger = TimeLogger()
    autoencoder.predict()
    print('Autoencoder predict finished. Time: ' + str(time_logger.finish()))

    time_logger = TimeLogger()
    differences = autoencoder.calc_differences(full_differences)
    print('Calculate differences finished. Time: ' + str(time_logger.finish()))

    if not full_differences:
        differences = sorted(enumerate(differences), key=lambda tup: tup[1], reverse=True)

    if not output_file:
        return differences

    time_logger = TimeLogger()

    if full_differences:
        binary_write(differences, features_number, output_file)
    else:
        ascii_write(differences, output_file)

    print('Write differences finished. Time: ' + str(time_logger.finish()))
def binary_read(differences_file):
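    """Read back a difference matrix written by binary_write.

    Floats are unpacked one 4-byte value at a time; the trailing float
    holds features_number and is used to restore the (samples, features)
    shape of the matrix.
    """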
    differences_read_logger = TimeLogger()
    with open(differences_file, 'rb') as f:
        buffer = f.read(4)
        differences = []
        chunk_counter = 0
        log_write_per_chunk_number = 10000000
        chunks_time_logger = TimeLogger()
        while buffer:
            differences.append(struct.unpack('=f', buffer)[0])
            buffer = f.read(4)
            if (chunk_counter + 1) % log_write_per_chunk_number == 0:
                print(
                    str(chunk_counter + 1) +
                    ' chunks read and unpacked. Time: ' +
                    str(chunks_time_logger.finish()))
                chunks_time_logger = TimeLogger()
            chunk_counter += 1
        print(
            str(chunk_counter) + ' chunks read and unpacked. Time: ' +
            str(chunks_time_logger.finish()))
    print('Differences read finished. Time: ' +
          str(differences_read_logger.finish()))

    transformation_logger = TimeLogger()
    differences = np.array(differences)
    features_number = int(differences[-1])
    # Restore the (samples, features) shape; order='F' matches the
    # column-major flatten('F') used in binary_write.
    differences = differences[:-1].reshape(
        (len(differences) - 1) // features_number,
        features_number,
        order='F')
    print('Differences transformation finished. Time: ' +
          str(transformation_logger.finish()))

    return differences
parser = argparse.ArgumentParser()
parser.add_argument('--keyword',
                    '-k',
                    nargs=1,
                    type=str,
                    help='keyword to search for on GitHub')
parser.add_argument('--token', '-t', nargs=1, type=str, help='GitHub token')
parser.add_argument('--directory',
                    '-d',
                    nargs=1,
                    type=str,
                    help='directory for saving Kotlin source code files')

args = parser.parse_args()
keyword = args.keyword[0]
token = args.token[0]
directory = args.directory[0]

LOG_FILE = 'log.txt'

github = GithubCodeCollector(token)

config = {'log_file': LOG_FILE, 'keyword': keyword, 'directory': directory}

time_logger = TimeLogger()

# code_search(github, config)
code_by_repo_search(github, config)

time_logger.finish(task_name='Code collection')