Example #1
def __preprocess(self, sentence):
    sentence = self.__to_lower(sentence)
    text_wo_link = NLPUtils.remove_hyperlinks(sentence)
    tokens = []
    try:
        tokens = NLPUtils.word_tokenization(text_wo_link)
        tokens = [NLPUtils.punctuation_removal(token) for token in tokens]
        tokens = NLPUtils.stopword_elimination(tokens)
        tokens = NLPUtils.nonalpha_removal(tokens)
    except AssertionError:
        print("Phrase '{}' cannot be preprocessed".format(sentence))
    return " ".join(tokens)
Example #2
def __load_row_acnet_file(infile, gold_permission, stemmer):
    print("Loading row {} ".format(infile))
    #read training data
    print("Reading Train Sentences")
    tagged_train_file = pd.read_csv(infile)
    train_sentence_reports = []
    acnet_map = {
        "RECORD_AUDIO": "MICROPHONE",
        "READ_CONTACTS": "CONTACTS",
        "READ_CALENDAR": "CALENDAR",
        "ACCESS_FINE_LOCATION": "LOCATION",
        "CAMERA": "CAMERA",
        "READ_SMS": "SMS",
        "READ_CALL_LOGS": "CALL_LOG",
        "CALL_PHONE": "PHONE",
        "WRITE_SETTINGS": "SETTINGS",
        "GET_TASKS": "TASKS"
    }
    for idx, row in tagged_train_file.iterrows():
        app_id = int(row["app_id"])
        sentence = row["sentence"]
        mark = row[acnet_map[gold_permission]] == 1  # use == (not "is"): pandas yields numpy ints
        sentence_report = SentenceReport(app_id, sentence, mark)
        sentence_report.preprocessed_sentence = " ".join(
            NLPUtils.preprocess_sentence(sentence_report.sentence, stemmer))
        sentence_report.all_phrases = __find_all_possible_phrases(
            sentence_report.preprocessed_sentence, sentence_only=True)
        train_sentence_reports.append(sentence_report)
    print("Loading completed")
    return train_sentence_reports
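The permission column comparison above uses == rather than the original "is", because values read from a pandas row are typically numpy integers and identity comparison against the literal 1 is not reliable; a quick check:

import pandas as pd

row = pd.Series({"MICROPHONE": 1})
value = row["MICROPHONE"]
print(type(value))   # <class 'numpy.int64'>, not a plain Python int
print(value == 1)    # True -- value equality is what the loader needs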
Example #3
def __load_row_whyper_file(infile, stemmer):
    print("Loading row {}".format(infile))
    tagged_test_file = pd.read_csv(infile)
    test_sentence_reports = []

    #read and preprocess whyper sentences
    print("Reading Test Sentences")
    for idx, row in tagged_test_file.iterrows():
        #TODO : UPDATE FOR APP ID
        sentence = str(row["Sentences"])
        if not sentence.startswith("#"):
            mark = None
            if "Manually Marked" in row:
                if row["Manually Marked"] == 1:
                    mark = True
                else:
                    mark = False
            else:
                raise Exception("Manually Marked label does not exist")
            sentence_report = SentenceReport(sentence, mark)
            sentence_report.preprocessed_sentence = " ".join(
                NLPUtils.preprocess_sentence(sentence_report.sentence,
                                             stemmer))
            sentence_report.all_phrases = __find_all_possible_phrases(
                sentence_report.preprocessed_sentence, sentence_only=True)
            test_sentence_reports.append(sentence_report)
    print("Loading completed")
    return test_sentence_reports
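The "Manually Marked" in row test works because membership on a pandas row (a Series) is checked against the index labels, i.e. the CSV column names, not the cell values; a small illustration:

import pandas as pd

row = pd.Series({"Sentences": "Allows the app to record audio.", "Manually Marked": 1})
print("Manually Marked" in row)   # True -- membership checks the labels
print(1 in row.values)            # True -- cell values are checked via .values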
Example #4
def load_row_acnet(infile, gold_permission, stemmer, embeddings):
    print("Loading row {} ".format(infile))
    # read training data
    print("Reading Train Sentences")
    tagged_train_file = pd.read_csv(infile)
    train_sentence_reports = []
    acnet_map = {
        "RECORD_AUDIO": "MICROPHONE",
        "READ_CONTACTS": "CONTACTS",
        "READ_CALENDAR": "CALENDAR",
        "ACCESS_FINE_LOCATION": "LOCATION",
        "CAMERA": "CAMERA",
        "READ_SMS": "SMS",
        "READ_CALL_LOGS": "CALL_LOG",
        "CALL_PHONE": "PHONE",
        "WRITE_SETTINGS": "SETTINGS",
        "GET_TASKS": "TASKS",
    }
    for idx, row in tagged_train_file.iterrows():
        app_id = row["app_id"]
        sentence = row["sentence"]
        mark = row[acnet_map[gold_permission]]
        sentence_report = SentenceReport(app_id, sentence, mark)
        preprocessed = NLPUtils.preprocess_sentence(sentence_report.sentence,
                                                    stemmer)
        sentence_report.preprocessed_sentence = [
            word for word in preprocessed if word in embeddings
        ]
        train_sentence_reports.append(sentence_report)
    print("Loading completed")
    return train_sentence_reports
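The final list comprehension keeps only tokens that have an entry in embeddings; any mapping keyed by word works, for example a dict of vectors loaded from a pre-trained model (a toy stand-in, not the project's actual embedding loader):

embeddings = {"record": [0.1, 0.2], "audio": [0.0, 0.3]}   # toy stand-in for word -> vector
preprocessed = ["record", "audio", "xyzzy"]
print([word for word in preprocessed if word in embeddings])
# -> ['record', 'audio']  (out-of-vocabulary tokens are dropped)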
Example #5
def predict_descriptions(lst):
    for item in lst:
        description = item['description']
        if not pd.isna(description):
            sentences = nltk.sent_tokenize(description)
            if len(sentences) > 0:
                sentence = sentences[0]
                sent = NLPUtils.preprocess_sentence(sentence, args.stemmer)
                prediction = predict_raw_sentence(model, sent)
                item["prediction"] = prediction
            else:
                item["prediction"] = -1
Example #6
def calculate_freqs(infile, stemmer, embeddings):
    tagged_train_file = pd.read_csv(infile)
    vocab_freq = {}
    for idx, row in tagged_train_file.iterrows():
        app_id = row["app_id"]
        sentence = row["sentence"]

        preprocessed = NLPUtils.preprocess_sentence(sentence, stemmer)
        for token in preprocessed:
            if token not in vocab_freq:
                vocab_freq[token] = 0
            vocab_freq[token] += 1
    return vocab_freq
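The frequency loop above is equivalent to updating a collections.Counter with each preprocessed sentence; a compact sketch with toy token lists standing in for the NLPUtils.preprocess_sentence output:

from collections import Counter

vocab_freq = Counter()
for tokens in (["record", "audio"], ["record", "video"]):
    vocab_freq.update(tokens)
print(vocab_freq)   # Counter({'record': 2, 'audio': 1, 'video': 1})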
Example #7
def predict_class_and_method_signatures(lst):
    for item in lst:
        all_tokens = []
        class_name = item['class']
        if not pd.isna(class_name):
            tokens = process_class_name(class_name)
            all_tokens.extend(tokens)
        if 'method' in item:
            method_name = item['method']
            if not pd.isna(method_name):
                tokens = process_method_name(method_name)
                all_tokens.extend(tokens)
        signature = " ".join(all_tokens)
        sent = NLPUtils.preprocess_sentence(signature, args.stemmer)
        prediction = predict_raw_sentence(model, sent)
        item["prediction"] = prediction
Example #8
def clean_play_store_data(file_path):
    """TODO"""
    number_of_apps = 0
    data = {}
    with open(file_path) as stream:
        reader = csv.reader(stream)
        next(reader)
        start_time = time.time()
        for row in reader:
            number_of_apps += 1
            app_id = row[0]
            text = row[1]
            data[app_id] = []
            for sentence in NLPUtils.sentence_tokenization(text):
                data[app_id].append(sentence)
    return data
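next(reader) consumes the header row before the loop; a minimal reader over an in-memory CSV shows the pattern (the two-column layout of the real file, app id followed by description text, is an assumption based on how row[0] and row[1] are used):

import csv
import io

stream = io.StringIO("app_id,description\ncom.example.app,Records audio. Stores files.\n")
reader = csv.reader(stream)
next(reader)                      # skip the header row
for row in reader:
    print(row[0], "->", row[1])
# com.example.app -> Records audio. Stores files.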
Example #9
def load_row_document_acnet_file(infile, stemmer, embeddings, filtered_words):
    print("Loading row {} ".format(infile))
    # read training data
    print("Reading Train Sentences")
    tagged_train_file = pd.read_csv(infile)
    documents = []
    acnet_map = {
        "RECORD_AUDIO": "MICROPHONE",
        "READ_CONTACTS": "CONTACTS",
        "READ_CALENDAR": "CALENDAR",
        "ACCESS_FINE_LOCATION": "LOCATION",
        "CAMERA": "CAMERA",
        "READ_SMS": "SMS",
        "READ_CALL_LOGS": "CALL_LOG",
        "CALL_PHONE": "PHONE",
        "WRITE_SETTINGS": "SETTINGS",
        "GET_TASKS": "TASKS",
        "STORAGE": "STORAGE",
    }

    for idx, row in tagged_train_file.iterrows():
        app_id = row["app_id"]
        sentence = row["sentence"]

        if documents == []:  # if it is the first document
            documents.append(DocumentReport(app_id))
        elif documents[-1].app_id != app_id:  # if it is a new document
            documents.append(DocumentReport(app_id))

        for permission in acnet_map:
            if (permission not in documents[-1].permissions
                    or row[acnet_map[permission]] == 1):
                documents[-1].permissions[permission] = row[
                    acnet_map[permission]]

        documents[-1].sentences.append(sentence)
        preprocessed = NLPUtils.preprocess_sentence(sentence, stemmer)

        filtered = []
        for word in preprocessed:
            if word in embeddings and word in filtered_words:
                filtered.append(word)
        documents[-1].preprocessed_sentences.append(filtered)

    print("Loading completed")
    return documents
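The documents[-1].app_id != app_id check relies on rows for the same app being contiguous in the CSV; the same grouping can be expressed with itertools.groupby (a sketch over toy (app_id, sentence) pairs, not the DocumentReport class itself):

from itertools import groupby

rows = [("app1", "records audio"), ("app1", "stores files"), ("app2", "shares location")]
for app_id, group in groupby(rows, key=lambda r: r[0]):
    print(app_id, [sentence for _, sentence in group])
# app1 ['records audio', 'stores files']
# app2 ['shares location']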
Example #10
def process_raw_dataset(file_path, out_file):
    """TODO"""
    number_of_apps = 0
    with open(file_path) as stream:
        with open(out_file, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)

            reader = csv.reader(stream)
            header = next(reader)
            writer.writerow(header)
            start_time = time.time()
            for row in reader:
                if number_of_apps % 100 == 0:
                    elapsed_time = time.time() - start_time
                    print("Number of apps processed is {}".format(number_of_apps))
                    print("Elapsed time up to now is {}".format(elapsed_time))

                number_of_apps += 1
                text = row[1]
                try:
                    sentences = []
                    if langdetect.detect(text) == u'en':
                        for sentence in NLPUtils.sentence_tokenization(text):
                            sentence = NLPUtils.remove_hyperlinks(sentence)
                            sentence = sentence.lower()
                            if sentence:
                                tokens = NLPUtils.word_tokenization(sentence)
                                tokens = [NLPUtils.punctuation_removal(token) for token in tokens]
                                tokens = NLPUtils.stopword_elimination(tokens)
                                tokens = NLPUtils.nonalpha_removal(tokens)
                                if tokens:
                                    sentence = " ".join(tokens)
                                    sentence = sentence.rstrip()
                                    if sentence != "":
                                        sentences.append(sentence.rstrip())
                        if sentences:
                            writer.writerow([NLPUtils.punctuation_removal(row[0]),
                                            "%%".join(sentences),
                                            "%%".join(row[2].split(",")),
                                            row[3]])
                except Exception:
                    # skip rows whose text cannot be language-detected or tokenized
                    pass
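langdetect.detect returns an ISO 639-1 code ('en', 'de', ...) and raises an exception on text it cannot profile (for example digits-only rows), which is one reason every row is wrapped in a try/except; a small illustration, assuming the langdetect package is installed:

import langdetect

print(langdetect.detect("This application records audio from the microphone."))   # 'en'
try:
    langdetect.detect("12345 !!!")
except Exception as exc:   # langdetect raises LangDetectException: "No features in text."
    print("detection failed:", exc)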
Example #11
def load_row_reviews(infile, stemmer, embeddings):
    print("Loading row {} ".format(infile))
    reviews = {}
    tagged_train_file = pd.read_csv(infile)
    for idx, row in tagged_train_file.iterrows():
        if idx != 0 and idx % 1000 == 0:
            print(idx)
        app_id, sentence, score = (
            row["application_id"],
            row["review_sentence"],
            row["score"],
        )
        if app_id and sentence and score:
            preprocessed = NLPUtils.preprocess_sentence(sentence, stemmer)
            if len(preprocessed) != 0:
                review = Review(sentence, score)
                if app_id not in reviews:
                    reviews[app_id] = []
                review.preprocessed_sentence = [
                    word for word in preprocessed if word in embeddings
                ]
                reviews[app_id].append(review)
    return reviews
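Note that the truthiness check on app_id, sentence and score does not catch missing values read in as NaN, because NaN is truthy in Python; pd.isna is the robust missing-value test (as used in Examples #5 and #7):

import pandas as pd

missing = float("nan")
print(bool(missing))     # True  -- a plain truthiness check lets NaN through
print(pd.isna(missing))  # True  -- explicit missing-value test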
for method in methods:
    description = method.description
    sentences = nltk.sent_tokenize(description)
    if len(sentences) > 0:
        sentence = sentences[0]
        try:
            tree = parser.parse(sentence)
        except ValueError:
            continue

        data = {}
        data["description"] = {"str": sentence, "prediction": -1}
        data["vp"] = {"str": extract_vp(tree), "prediction": -1}

        sent = NLPUtils.preprocess_sentence(data["description"]["str"],
                                            args.stemmer)
        data["description"]["prediction"] = predict_raw_sentence(model, sent)

        if data["vp"]["str"] != "-":
            sent = NLPUtils.preprocess_sentence(data["vp"]["str"],
                                                args.stemmer)
            data["vp"]["prediction"] = predict_raw_sentence(model, sent)

        d = Description(data)
        method.descriptions.append(d)
        try:
            session.add(d)
            session.commit()
        except Exception:
            session.rollback()   # roll back the failed transaction before continuing
            print("Database Error.")