Example #1
def get_closest_docs(uri):
    #user_doc = requests.get(uri).text
    r = requests.get(uri)
    if r.status_code == 200:
        user_doc = r.text
        print("URI content length", len(user_doc))
        code, _ = separate_code_and_comments(user_doc, "user doc")
        normalized_code = normalize_text(code,
                                         remove_stop_words=False,
                                         only_letters=False,
                                         return_list=True)
        model.random.seed(0)
        user_vector = model.infer_vector(normalized_code)
        print("finding similar...")
        sys.stdout.flush()
        stored_urls = list()
        stored_vectors = list()
        for url in vectors:
            stored_urls.append(url)
            stored_vectors.append(vectors[url])
        pair_sims = cosine_similarity(user_vector.reshape(1, -1),
                                      stored_vectors)
        indices = (-pair_sims[0]).argsort()[:5]
        return [(stored_urls[index], round(float(pair_sims[0][index]), 2))
                for index in indices]
    else:
        print("URL returned status code", r.status_code)
        raise ValueError('URL error')
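The ranking step above relies on a small trick: argsort sorts ascending, so negating the similarity row yields indices in descending-similarity order. A minimal standalone sketch of that top-k selection with toy vectors (the array values below are purely illustrative):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Hypothetical query vector and three stored vectors (toy values)
user_vector = np.array([1.0, 0.0, 1.0])
stored_vectors = np.array([[1.0, 0.1, 0.9],   # nearly parallel -> highest similarity
                           [0.0, 1.0, 0.0],   # orthogonal -> similarity near zero
                           [0.5, 0.5, 0.5]])

pair_sims = cosine_similarity(user_vector.reshape(1, -1), stored_vectors)
# argsort is ascending, so sort the negated similarities to get a descending ranking
indices = (-pair_sims[0]).argsort()[:2]
print(indices, pair_sims[0][indices])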
Example #2
def main(script_folder, topics, vocab_pickle_filename, model_pickle_filename,
         max_script_count, use_binary, n_jobs):

    # Retrieve existing vocabulary
    if vocab_pickle_filename is not None:
        with open(vocab_pickle_filename, "rb") as vocab_file:
            vocab = pickle.load(vocab_file)
    else:
        logger.warning("A pickle file containing the bag-of-words vocabulary is required")
        quit()

    code_scripts_list = list()
    counter = 0

    # Retrieve files containing Python scripts
    # Altair's JSON format uses the 'content' label for the script code
    for py_file in sorted(os.listdir(script_folder)):
        if counter >= max_script_count: break
        fullpath = os.path.join(script_folder, py_file)
        with open(fullpath, "r") as py_file_contents:
            for line in py_file_contents:
                if counter >= max_script_count: break
                counter += 1
                parsed_json = json.loads(line)
                code, comments = separate_code_and_comments(parsed_json['content'], py_file)
                if len(code) == 0:
                    continue
                else:
                    normalized_code = normalize_text(code,
                                                     remove_stop_words=True,
                                                     only_letters=False,
                                                     return_list=False,
                                                     remove_one_char_words=True)
                    code_scripts_list.append(normalized_code)

    lda_model = build_lda_model(code_scripts_list, topics, vocab, use_binary, n_jobs)

    #logger.info("Saving LDA model in a pickle file at %s" % model_pickle_filename)
    pickle.dump(lda_model, open(model_pickle_filename, "wb"))
    logger.info("LDA model pickle file saved at %s" % model_pickle_filename)
def vectorize_code(code_vectors, model, code_urls, vector_file):
    # code_urls is a set; convert it to a list so entries can be addressed by index
    code_urls_list = list(code_urls)
    # start at a point close to where the previous code_vectors file left off
    starting_point = len(code_vectors)
    for i in range(starting_point, len(code_urls_list)):
        url = code_urls_list[i]
        if i > starting_point and i % 5000 == 0:
            intermediate_save(code_vectors, vector_file)
        # fetch each script once and reuse the response for the status check and the body
        r = requests.get(url)
        if r.status_code == 200:
            try:
                code = r.text
                parsed_code, _ = separate_code_and_comments(code, "code")
                normalized_code = normalize_text(parsed_code,
                                                 remove_stop_words=False,
                                                 only_letters=False,
                                                 return_list=True)
                if len(normalized_code) > 1:
                    model.random.seed(0)
                    vector = model.infer_vector(normalized_code)
                    code_vectors[url] = vector
                else:
                    print("Parsing resulted in empty list for", url)
                    continue
            except Exception:
                print("Unexpected error:", sys.exc_info()[0])
                continue
        else:
            print("Error code {} for url: {}".format(r.status_code, url))
    return code_vectors
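The periodic intermediate_save call above is what makes this crawl resumable: the loop starts at len(code_vectors), so a partially written vector file can be reloaded and extended on the next run. The helper itself is not shown in this example; a minimal sketch of what such a checkpoint function could look like (this implementation is an assumption, not the original intermediate_save):

import pickle

def intermediate_save(code_vectors, vector_file):
    # Hypothetical checkpoint: overwrite the vector file with the current dict
    # so a later run can resume from index len(code_vectors)
    with open(vector_file, "wb") as f:
        pickle.dump(code_vectors, f)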
Example #4
def build_bow_script_vocabulary(script_folder,
                                max_script_count=10000,
                                max_vocab_size=5000,
                                min_word_count=2):
    '''
    Generates the vocabulary (a list of words) to be used by techniques that rely on a bag-of-words representation.
    Args:
        script_folder (str): folder location of the corpus containing script files
        max_script_count (int): the maximum number of code scripts to process in script_folder
        max_vocab_size (int): the maximum number of words in the vocabulary (the dimension of the bag-of-words vector)
        min_word_count (int): a word is included in the vocabulary only if it appears more than min_word_count times in the corpus
    Returns:
        words_ordered_by_count (list): a list of at most max_vocab_size of the most frequent normalized words in the corpus
    '''
    word_count = defaultdict(int)
    counter = 0

    # Read file contents, extract code, normalize contents and count resulting tokens
    # Altair's JSON format uses the 'content' label for the script code
    for py_file in sorted(os.listdir(script_folder)):
        if counter >= max_script_count: break
        fullpath = os.path.join(script_folder, py_file)
        with open(fullpath, "r") as py_file_contents:
            for line in py_file_contents:
                counter += 1
                parsed_json = json.loads(line)
                code, comments = separate_code_and_comments(
                    parsed_json['content'], py_file)
                normalized_script = normalize_text(code,
                                                   remove_stop_words=True,
                                                   only_letters=False,
                                                   return_list=True,
                                                   remove_one_char_words=True)
                for token in normalized_script:
                    word_count[token] += 1
                if counter >= max_script_count: break

    # Order words by descending count, keeping only those above the min_word_count threshold
    words_ordered_by_count = [
        i[0] for i in sorted(
            word_count.items(), key=lambda x: (x[1], x[0]), reverse=True)
        if i[1] > min_word_count
    ]

    # Trim the vocabulary to the requested max_vocab_size
    if len(words_ordered_by_count) >= max_vocab_size:
        words_ordered_by_count = words_ordered_by_count[:max_vocab_size]
    else:
        logger.warning("Only %d words were observed using max_script_count=%d, max_vocab_size=%d and min_word_count=%d" %
                       (len(words_ordered_by_count), max_script_count, max_vocab_size, min_word_count))

    return words_ordered_by_count
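A hedged usage sketch for this helper (the folder layout and file names below are hypothetical; in Example #2 above, a pickled vocabulary like this is loaded and passed to build_lda_model):

import pickle

# Build a vocabulary from a folder of Altair-format JSON script files
# and persist it for a later training step (paths are illustrative)
vocab = build_bow_script_vocabulary("data/scripts",
                                    max_script_count=10000,
                                    max_vocab_size=5000,
                                    min_word_count=2)
with open("bow_vocabulary.pkl", "wb") as vocab_file:
    pickle.dump(vocab, vocab_file)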
Example #5
def main(script_folder, output_folder, min_script_len, max_total_files,
         max_per_pkl):

    doc2vec_tagged_documents = list()
    counter = 0
    logger.info("retrieving files")
    just_started = True

    # Retrieve files containing Python scripts
    # Altair's JSON format uses the 'content' label for the script code
    for py_file in sorted(os.listdir(script_folder)):
        if counter >= max_total_files: break
        fullpath = os.path.join(script_folder, py_file)
        with open(fullpath, "r") as py_file_contents:
            for line in py_file_contents:
                if counter >= max_total_files: break
                if counter != 0 and counter % 50000 == 0:
                    logger.info("processed %d files" % counter)
                if not just_started and counter % max_per_pkl == 0:
                    logger.info(
                        "Saving pickle file of tagged documents for size %d",
                        max_per_pkl)
                    pickle.dump(
                        doc2vec_tagged_documents,
                        open(
                            os.path.join(output_folder,
                                         "training" + str(counter) + ".pkl"),
                            "wb"))
                    doc2vec_tagged_documents = list()
                    just_started = True
                parsed_json = json.loads(line)
                code, _ = separate_code_and_comments(parsed_json['content'],
                                                     py_file)
                if len(code) < min_script_len:
                    continue
                else:
                    tokenized_code = normalize_text(code,
                                                    remove_stop_words=False,
                                                    only_letters=False,
                                                    return_list=True,
                                                    remove_one_char_words=True)
                    if len(tokenized_code) > 1:
                        doc2vec_tagged_documents.append(
                            doc2vec.TaggedDocument(tokenized_code, [counter]))
                        counter += 1
                        just_started = False

    logger.info("Saving final pickle file of tagged documents for size %d",
                max_per_pkl)
    pickle.dump(
        doc2vec_tagged_documents,
        open(os.path.join(output_folder, "training" + str(counter) + ".pkl"),
             "wb"))
def main(script_folder, model_pickle_filename, training_algorithm, num_cores,
         epochs, vector_size, window, min_count, alpha, max_script_count,
         min_script_len, negative):

    doc2vec_tagged_documents = list()
    counter = 0

    logger.info("retrieving files")

    # Retrieve files containing Python scripts
    # Altair's JSON format uses the 'content' label for the script code
    for py_file in sorted(os.listdir(script_folder)):
        if counter >= max_script_count: break
        if counter != 0 and counter % 100000 == 0: logger.info("processed %d files" % counter)
        fullpath = os.path.join(script_folder, py_file)
        with open(fullpath, "r") as py_file_contents:
            for line in py_file_contents:
                if counter >= max_script_count: break
                parsed_json = json.loads(line)
                code, comments = separate_code_and_comments(
                    parsed_json['content'], py_file)
                if len(code) < min_script_len:
                    continue
                else:
                    tokenized_code = normalize_text(code,
                                                    remove_stop_words=False,
                                                    only_letters=False,
                                                    return_list=True,
                                                    remove_one_char_words=True)
                    doc2vec_tagged_documents.append(
                        doc2vec.TaggedDocument(tokenized_code, [counter]))
                    counter += 1

    doc2vec_model = build_doc2vec_model(doc2vec_tagged_documents,
                                        training_algorithm, num_cores, epochs,
                                        vector_size, window, min_count, alpha,
                                        negative)

    # Per http://radimrehurek.com/gensim/models/doc2vec.html, delete_temporary_training_data reduces model size
    # If keep_doctags_vectors is set to false, most_similar, similarity, and sims are no longer available
    # If keep_inference is set to false, infer_vector on a new document is no longer possible
    doc2vec_model.delete_temporary_training_data(keep_doctags_vectors=False,
                                                 keep_inference=True)

    # Per http://radimrehurek.com/gensim/models/doc2vec.html, doc2vec has its own method for saving/loading models
    # doc2vec_model.save(model_pickle_filename)
    # doc2vec_model = doc2vec.Doc2Vec.load(model_pickle_filename)

    #logger.info("saving doc2vec model in a pickle file at %s" % model_pickle_filename)
    pickle.dump(doc2vec_model, open(model_pickle_filename, "wb"))
    logger.info("doc2vec model pickle file saved at %s" %
                model_pickle_filename)
Example #7
def vectorize_code(current_script_fullpath, model, remove_comments):
    with open(current_script_fullpath, "r") as script_file:
        code = script_file.read()
    if remove_comments:
        parsed_code, _ = separate_code_and_comments(code, "code")
    else:
        parsed_code = code
    normalized_code = normalize_text(parsed_code,
                                     remove_stop_words=False,
                                     only_letters=False,
                                     return_list=True)
    if len(normalized_code) > 1:
        model.random.seed(0)
        return model.infer_vector(normalized_code)
    else:
        print("Warning - Parsing resulted in empty list for", current_script_fullpath)
        return None
Example #8
def get_closest_docs(uri):
    user_doc = requests.get(uri).text
    code, _ = separate_code_and_comments(user_doc, "user doc")
    normalized_code = normalize_text(code,
                                     remove_stop_words=False,
                                     only_letters=False,
                                     return_list=True)
    model.random.seed(0)
    user_vector = model.infer_vector(normalized_code)
    print("finding similar...")
    stored_urls = list()
    stored_vectors = list()
    for url in vectors:
        stored_urls.append(url)
        stored_vectors.append(vectors[url])
    pair_sims = cosine_similarity(user_vector.reshape(1, -1), stored_vectors)
    indices = (-pair_sims[0]).argsort()[:5]
    return [(stored_urls[index], round(float(pair_sims[0][index]), 2))
            for index in indices]
Example #9
def main(data_path, num_cores, top_n_param, vectorizer):
    global raw
    global features
    global q

    # Workaround for the error thrown by score_performance when top_n is not declared globally
    global top_n
    top_n = top_n_param

    raw = read_data(data_path)
    """
    # Remove items where competition IDs are in:
    # PyCon2015 Tutorial (#4353)
    # Word2Vec NLP Tutorial (#3971)
    filter_comp_ids = ["4353", "3971"]
    idxs_to_remove = set()
    for idx, r in enumerate(raw):
        if r["CompetitionId"] in filter_comp_ids:
            idxs_to_remove.add(idx)
    raw = [r for idx, r in enumerate(raw) if idx not in idxs_to_remove]
    """
    """
    # Take a random sample from raw.
    import random
    raw = random.sample(raw, 2000)
    """

    # Strip out comments and keep each script that still has code; otherwise drop it from the raw list
    scripts = list()
    raw_with_code = list()
    for script in raw:
        code, _ = separate_code_and_comments(script["ScriptContent"],
                                             script["ScriptTitle"])
        if len(code) > 0:
            scripts.append(code)
            raw_with_code.append(script)
    # popping from raw while iterating would shift indices, so rebuild the list instead
    raw = raw_with_code
    #scripts = [script["ScriptContent"] for script in raw]

    # Choose vectorizer
    print("Vectorizing documents...")
    #vectorizer.vectorizer.fit(scripts)
    features = vectorizer.vectorize_multi(scripts)
    features_dense = features.todense() if issparse(features) else features
    p = Pool(num_cores, q_init, [q])
    print("Calculating pairwise similarities + scores...")
    for _ in tqdm.tqdm(p.imap_unordered(score_performance,
                                        list(enumerate(features_dense))),
                       total=len(features_dense)):
        pass

    score_top_1 = 0
    score_top_n_any = 0
    score_top_n_all = 0

    while not q.empty():
        top_1, top_n_any, top_n_all = q.get()

        score_top_1 += top_1
        score_top_n_any += top_n_any
        score_top_n_all += top_n_all

    top_1_accuracy = score_top_1 / float(len(raw))
    top_n_any_accuracy = score_top_n_any / float(len(raw))
    top_n_all_accuracy = score_top_n_all / float(len(raw))

    print("Top 1: %s" % top_1_accuracy)
    print("Top N (Any): %s" % top_n_any_accuracy)
    print("Top N (All): %s" % top_n_all_accuracy)
    print("(N = %s)" % top_n)

    return {
        "top_1_accuracy": top_1_accuracy,
        "top_n_any_accuracy": top_n_any_accuracy,
        "top_n_all_accuracy": top_n_all_accuracy,
        "top_n": top_n
    }
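Example #9 leans on a pattern that is easy to miss when reading the snippet in isolation: the results queue q is a module-level global that every Pool worker receives through the q_init initializer, and score_performance pushes its scores onto that queue instead of returning them. A minimal self-contained sketch of the same pattern (the work function and its squared-number payload are hypothetical, and a Manager queue is used here so the sketch also works with the spawn start method):

from multiprocessing import Pool, Manager

def q_init(queue):
    # Store the queue in a module-level global so each worker process can reach it
    global q
    q = queue

def work(item):
    # Hypothetical worker: push a result onto the shared queue instead of returning it
    q.put(item * item)

if __name__ == "__main__":
    manager = Manager()
    result_queue = manager.Queue()
    with Pool(2, q_init, [result_queue]) as pool:
        for _ in pool.imap_unordered(work, range(10)):
            pass
    results = []
    while not result_queue.empty():
        results.append(result_queue.get())
    print(sorted(results))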
Example #10
def main(args):

    # Open input and output files
    csv_file = open(args.input_file, 'r')
    json_file = open(args.output_file, 'w')

    # Check if user wants to customize csv field order
    if not args.field_order_file:
        field_names = ["ScriptProjectId", "ScriptVersionId", "AuthorUserId", "UserDisplayName", "CompetitionId", "CompetitionName", "ScriptTitle", "ScriptContent"]
    else:
        with open(args.field_order_file, 'r') as order_file:
            field_order_reader = csv.reader(order_file)
            # the first row of the field order file defines the column names
            field_names = next(field_order_reader)

    # Address csv error regarding fields that exceed default size limit
    # Adapted from Stack Overflow post by user1251007
    maxInt = sys.maxsize
    decrement = True

    while decrement:
        # decrease the maxInt value by factor 10
        # as long as the OverflowError occurs.

        decrement = False
        try:
            csv.field_size_limit(maxInt)
        except OverflowError:
            maxInt = int(maxInt/10)
            decrement = True

    # Read CSV and Write the JSON to output_file after doing some preprocessing
    reader = csv.DictReader(csv_file, field_names)

    files = 0
    parsed_competitions = defaultdict(list)
    logger.info("Processing csv file...")

    for row in reader:
        files += 1
        # Remove very short scripts based on command line arguments
        script_len = len(separate_code_and_comments(row['ScriptContent'], row['ScriptTitle'])[0])
        if script_len < args.min_script_len:
            continue
        # Remove meta kaggle scripts labeled as python that are probably R
        if row['ScriptContent'].find("<-") != -1 and row['ScriptContent'].find("library(") != -1:
            continue
        # Remove Kaggle competition name from the script content to allow model testing on competitions
        if 'CompetitionName' in row and 'ScriptContent' in row:
            # str.replace returns a new string, so the result must be assigned back to the row
            row['ScriptContent'] = row['ScriptContent'].replace(row['CompetitionName'], " ")
            row['ScriptContent'] = row['ScriptContent'].replace(row['CompetitionName'].lower(), " ")

        parsed_competitions[row['CompetitionId']].append(row)

    submissions_deduped = list()
    logger.info("Removing duplicates...")

    # Iterate over competitions to remove duplicates and near duplicates
    for competition in parsed_competitions:
        counter = 0
        submissions = parsed_competitions[competition]

        # Pair-wise SequenceMatcher comparison of ScriptContent
        for i in range(len(submissions)):
            for j in range(len(submissions)):
                if i != j and SequenceMatcher(None, submissions[i]['ScriptContent'].lower(),
                                              submissions[j]['ScriptContent'].lower()).ratio() > args.duplicate_threshold:
                    submissions[i]['ScriptContent'] = ""
                    counter += 1
                    break
        remove_empties = [x for x in submissions if x['ScriptContent'] != ""]
        logger.info("%d duplicates removed from %d submissions in competition %s" %
                    (counter, len(submissions), competition))

        # Ensure competition has at least ten entries for future comparison
        if len(remove_empties) >= 10:
            for item in remove_empties:
                submissions_deduped.append(item)
        else:
            logger.warning("Competition %s has too few remaining submissions at threshold %f" %
                           (competition, args.duplicate_threshold))

    # Build a custom namedtuple to integrate into pyminifer argparse command line methods
    if args.minimize or args.obfuscate:
        options_tuple = namedtuple("options_tuple", ["tabs", "minimize", "obfuscate", "replacement_length"])
        options = options_tuple(False, args.minimize, args.obfuscate, 1)


    errors = 0
    written = 0

    for row in submissions_deduped:

        # Minimize size of python script if set in args
        if args.minimize or args.obfuscate:
            try:
                tokens = token_utils.listified_tokenizer(row['ScriptContent'])
                source = minification.minify(tokens, options)
                tokens = token_utils.listified_tokenizer(source)

                # Obfuscate python script
                if args.obfuscate:
                    table = [{}]
                    module = row['ScriptTitle']
                    name_generator = obfuscate.obfuscation_machine(identifier_length=int(options.replacement_length))
                    obfuscate.obfuscate(module, tokens, options, name_generator=name_generator, table=table)

                # Convert back to text
                result = token_utils.untokenize(tokens)
                row['ScriptContent'] = result

            except Exception as e:
                # logger.info("%s in %s; continuing" % (e.__class__.__name__, row['ScriptTitle']))
                errors += 1
                continue

        written += 1
        json.dump(row, json_file)
        json_file.write('\n')

    logger.info("Total files reviewed: %d" % files)
    if args.minimize or args.obfuscate:
        logger.info("File that failed pyminifier minimization/obfuscation parsing: %d" % errors)
    logger.info("Files successfully parsed to json: %d" % written)
    csv_file.close()
    json_file.close()
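The near-duplicate filter in this example boils down to difflib's SequenceMatcher ratio compared against a threshold. A minimal sketch of that check in isolation (the 0.9 threshold and the sample strings are illustrative; the real value comes from args.duplicate_threshold):

from difflib import SequenceMatcher

def is_near_duplicate(script_a, script_b, threshold=0.9):
    # ratio() is 1.0 for identical strings and falls toward 0.0 as they diverge;
    # lower-casing mirrors the comparison used in the deduplication loop above
    return SequenceMatcher(None, script_a.lower(), script_b.lower()).ratio() > threshold

print(is_near_duplicate("import numpy as np\nprint(np.ones(3))",
                        "import numpy as np\nprint(np.ones(4))"))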