Example #1
def __load_row_acnet_file(infile, gold_permission, stemmer):
    print("Loading row {} ".format(infile))
    #read training data
    print("Reading Train Sentences")
    tagged_train_file = pd.read_csv(infile)
    train_sentence_reports = []
    acnet_map = {
        "RECORD_AUDIO": "MICROPHONE",
        "READ_CONTACTS": "CONTACTS",
        "READ_CALENDAR": "CALENDAR",
        "ACCESS_FINE_LOCATION": "LOCATION",
        "CAMERA": "CAMERA",
        "READ_SMS": "SMS",
        "READ_CALL_LOGS": "CALL_LOG",
        "CALL_PHONE": "PHONE",
        "WRITE_SETTINGS": "SETTINGS",
        "GET_TASKS": "TASKS"
    }
    for idx, row in tagged_train_file.iterrows():
        app_id = int(row["app_id"])
        sentence = row["sentence"]
        # pandas cell values are numpy scalars, so an identity check
        # ("is 1") is always False; compare by value instead
        mark = row[acnet_map[gold_permission]] == 1
        sentence_report = SentenceReport(app_id, sentence, mark)
        sentence_report.preprocessed_sentence = " ".join(
            NLPUtils.preprocess_sentence(sentence_report.sentence, stemmer))
        sentence_report.all_phrases = __find_all_possible_phrases(
            sentence_report.preprocessed_sentence, sentence_only=True)
        train_sentence_reports.append(sentence_report)
    print("Loading completed")
    return train_sentence_reports
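A minimal call sketch for the loader above; the CSV path and the "snowball" stemmer value are assumptions (elsewhere in these examples the stemmer is taken directly from args.stemmer):

# hypothetical invocation; the leading double underscore only marks the
# function as module-private, so it is called from within the same module
reports = __load_row_acnet_file("acnet_train.csv", "RECORD_AUDIO", "snowball")
print("{} sentences loaded".format(len(reports)))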
Example #2
def __load_row_whyper_file(infile, stemmer):
    print("Loading row {}".format(infile))
    tagged_test_file = pd.read_csv(infile)
    test_sentence_reports = []

    #read and preprocess whyper sentences
    print("Reading Test Sentences")
    for idx, row in tagged_test_file.iterrows():
        #TODO : UPDATE FOR APP ID
        sentence = str(row["Sentences"])
        if not sentence.startswith("#"):
            if "Manually Marked" in row:
                mark = row["Manually Marked"] == 1
            else:
                raise ValueError("Manually Marked label does not exist")
            sentence_report = SentenceReport(sentence, mark)
            sentence_report.preprocessed_sentence = " ".join(
                NLPUtils.preprocess_sentence(sentence_report.sentence,
                                             stemmer))
            sentence_report.all_phrases = __find_all_possible_phrases(
                sentence_report.preprocessed_sentence, sentence_only=True)
            test_sentence_reports.append(sentence_report)
    print("Loading completed")
    return test_sentence_reports
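Example #3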
def load_row_acnet(infile, gold_permission, stemmer, embeddings):
    print("Loading row {} ".format(infile))
    # read training data
    print("Reading Train Sentences")
    tagged_train_file = pd.read_csv(infile)
    train_sentence_reports = []
    acnet_map = {
        "RECORD_AUDIO": "MICROPHONE",
        "READ_CONTACTS": "CONTACTS",
        "READ_CALENDAR": "CALENDAR",
        "ACCESS_FINE_LOCATION": "LOCATION",
        "CAMERA": "CAMERA",
        "READ_SMS": "SMS",
        "READ_CALL_LOGS": "CALL_LOG",
        "CALL_PHONE": "PHONE",
        "WRITE_SETTINGS": "SETTINGS",
        "GET_TASKS": "TASKS",
    }
    for idx, row in tagged_train_file.iterrows():
        app_id = row["app_id"]
        sentence = row["sentence"]
        mark = row[acnet_map[gold_permission]]
        sentence_report = SentenceReport(app_id, sentence, mark)
        preprocessed = NLPUtils.preprocess_sentence(sentence_report.sentence,
                                                    stemmer)
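        # keep only tokens covered by the embedding vocabulary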
        sentence_report.preprocessed_sentence = [
            word for word in preprocessed if word in embeddings
        ]
        train_sentence_reports.append(sentence_report)
    print("Loading completed")
    return train_sentence_reports
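A call sketch for load_row_acnet. The CSV path, the "snowball" stemmer value, and the gensim loading step are assumptions; the function only needs the embeddings object to support "word in embeddings" membership tests:

# assumption: pre-trained vectors loaded with gensim; any mapping
# with a __contains__ method works just as well
from gensim.models import KeyedVectors

embeddings = KeyedVectors.load_word2vec_format("vectors.bin", binary=True)
reports = load_row_acnet("acnet_train.csv", "CAMERA", "snowball", embeddings)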
Example #4
def predict_descriptions(lst):
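    # relies on module-level "model", "args", and predict_raw_sentence,
    # which are defined elsewhere in the original script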
    for item in lst:
        description = item['description']
        if not pd.isna(description):
            sentences = nltk.sent_tokenize(description)
            if len(sentences) > 0:
                sentence = sentences[0]
                sent = NLPUtils.preprocess_sentence(sentence, args.stemmer)
                prediction = predict_raw_sentence(model, sent)
                item["prediction"] = prediction
            else:
                item["prediction"] = -1
Example #5
def calculate_freqs(infile, stemmer, embeddings):
    # note: the embeddings argument is accepted but unused here
    tagged_train_file = pd.read_csv(infile)
    vocab_freq = {}
    for idx, row in tagged_train_file.iterrows():
        sentence = row["sentence"]
        preprocessed = NLPUtils.preprocess_sentence(sentence, stemmer)
        # count stemmed-token frequencies over the whole corpus
        for token in preprocessed:
            vocab_freq[token] = vocab_freq.get(token, 0) + 1
    return vocab_freq
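The counting loop above is exactly what collections.Counter does; a compact equivalent sketch (the function name calculate_freqs_counter is hypothetical, and pd and NLPUtils come from the surrounding module):

# equivalent frequency count using the standard library
from collections import Counter

def calculate_freqs_counter(infile, stemmer):
    rows = pd.read_csv(infile)
    return Counter(
        token
        for _, row in rows.iterrows()
        for token in NLPUtils.preprocess_sentence(row["sentence"], stemmer)
    )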
Example #6
def predict_class_and_method_signatures(lst):
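    # build a pseudo-sentence from class- and method-name tokens, then classify it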
    for item in lst:
        all_tokens = []
        class_name = item['class']
        if not pd.isna(class_name):
            tokens = process_class_name(class_name)
            all_tokens.extend(tokens)
        if 'method' in item:
            method_name = item['method']
            if not pd.isna(method_name):
                tokens = process_method_name(method_name)
                all_tokens.extend(tokens)
        signature = " ".join(all_tokens)
        sent = NLPUtils.preprocess_sentence(signature, args.stemmer)
        prediction = predict_raw_sentence(model, sent)
        item["prediction"] = prediction
Example #7
def load_row_document_acnet_file(infile, stemmer, embeddings, filtered_words):
    print("Loading row {} ".format(infile))
    # read training data
    print("Reading Train Sentences")
    tagged_train_file = pd.read_csv(infile)
    documents = []
    acnet_map = {
        "RECORD_AUDIO": "MICROPHONE",
        "READ_CONTACTS": "CONTACTS",
        "READ_CALENDAR": "CALENDAR",
        "ACCESS_FINE_LOCATION": "LOCATION",
        "CAMERA": "CAMERA",
        "READ_SMS": "SMS",
        "READ_CALL_LOGS": "CALL_LOG",
        "CALL_PHONE": "PHONE",
        "WRITE_SETTINGS": "SETTINGS",
        "GET_TASKS": "TASKS",
        "STORAGE": "STORAGE",
    }

    for idx, row in tagged_train_file.iterrows():
        app_id = row["app_id"]
        sentence = row["sentence"]

        if not documents or documents[-1].app_id != app_id:
            # start a new DocumentReport whenever the app id changes
            documents.append(DocumentReport(app_id))

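        # OR the per-sentence labels: once a permission is marked 1 for any
        # sentence, it stays marked for the whole document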
        for permission in acnet_map:
            if (permission not in documents[-1].permissions
                    or row[acnet_map[permission]] == 1):
                documents[-1].permissions[permission] = row[
                    acnet_map[permission]]

        documents[-1].sentences.append(sentence)
        preprocessed = NLPUtils.preprocess_sentence(sentence, stemmer)

        filtered = [
            word for word in preprocessed
            if word in embeddings and word in filtered_words
        ]
        documents[-1].preprocessed_sentences.append(filtered)

    print("Loading completed")
    return documents
Example #8
def load_row_reviews(infile, stemmer, embeddings):
    print("Loading row {} ".format(infile))
    reviews = {}
    tagged_train_file = pd.read_csv(infile)
    for idx, row in tagged_train_file.iterrows():
        if idx != 0 and idx % 1000 == 0:
            print(idx)
        app_id, sentence, score = (
            row["application_id"],
            row["review_sentence"],
            row["score"],
        )
        # NaN is truthy, so use pd.isna to skip rows with missing values
        if not (pd.isna(app_id) or pd.isna(sentence) or pd.isna(score)):
            preprocessed = NLPUtils.preprocess_sentence(sentence, stemmer)
            if len(preprocessed) != 0:
                review = Review(sentence, score)
                if app_id not in reviews:
                    reviews[app_id] = []
                review.preprocessed_sentence = [
                    word for word in preprocessed if word in embeddings
                ]
                reviews[app_id].append(review)
    return reviews
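Example #9
# the snippet below assumes module-level objects created earlier in the
# original script: "methods" (ORM records), "parser" (a syntactic parser
# whose output trees feed extract_vp), "model", "args", and an open
# database "session"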
for method in methods:
    description = method.description
    sentences = nltk.sent_tokenize(description)
    if len(sentences) > 0:
        sentence = sentences[0]
        try:
            tree = parser.parse(sentence)
        except ValueError:
            continue

        data = {}
        data["description"] = {"str": sentence, "prediction": -1}
        data["vp"] = {"str": extract_vp(tree), "prediction": -1}

        sent = NLPUtils.preprocess_sentence(data["description"]["str"],
                                            args.stemmer)
        data["description"]["prediction"] = predict_raw_sentence(model, sent)

        if data["vp"]["str"] != "-":
            sent = NLPUtils.preprocess_sentence(data["vp"]["str"],
                                                args.stemmer)
            data["vp"]["prediction"] = predict_raw_sentence(model, sent)

        d = Description(data)
        method.descriptions.append(d)
        try:
            session.add(d)
            session.commit()

        except Exception:
            # a failed commit leaves the session in a broken state;
            # roll back so later iterations can still use it
            session.rollback()
            print("Database Error.")