import nltk
import pandas as pd

# NLPUtils, SentenceReport, DocumentReport, Review, Description and the
# module-level helpers and globals used below (__find_all_possible_phrases,
# predict_raw_sentence, process_class_name, process_method_name, extract_vp,
# model, args, parser, session, methods) are assumed to be defined elsewhere
# in this project.


def __load_row_acnet_file(infile, gold_permission, stemmer):
    print("Loading row {}".format(infile))
    # read training data
    print("Reading Train Sentences")
    tagged_train_file = pd.read_csv(infile)
    train_sentence_reports = []
    # map Android permission names to the matching AC-Net CSV columns
    acnet_map = {
        "RECORD_AUDIO": "MICROPHONE",
        "READ_CONTACTS": "CONTACTS",
        "READ_CALENDAR": "CALENDAR",
        "ACCESS_FINE_LOCATION": "LOCATION",
        "CAMERA": "CAMERA",
        "READ_SMS": "SMS",
        "READ_CALL_LOGS": "CALL_LOG",
        "CALL_PHONE": "PHONE",
        "WRITE_SETTINGS": "SETTINGS",
        "GET_TASKS": "TASKS",
    }
    for idx, row in tagged_train_file.iterrows():
        app_id = int(row["app_id"])
        sentence = row["sentence"]
        # the original `is 1` compared object identity, which is unreliable
        # for integers; test the 0/1 gold label with equality instead
        mark = row[acnet_map[gold_permission]] == 1
        sentence_report = SentenceReport(app_id, sentence, mark)
        sentence_report.preprocessed_sentence = " ".join(
            NLPUtils.preprocess_sentence(sentence_report.sentence, stemmer))
        sentence_report.all_phrases = __find_all_possible_phrases(
            sentence_report.preprocessed_sentence, sentence_only=True)
        train_sentence_reports.append(sentence_report)
    print("Loading completed")
    return train_sentence_reports
def __load_row_whyper_file(infile, stemmer):
    print("Loading row {}".format(infile))
    tagged_test_file = pd.read_csv(infile)
    test_sentence_reports = []
    # read and preprocess WHYPER sentences
    print("Reading Test Sentences")
    for idx, row in tagged_test_file.iterrows():
        # TODO: UPDATE FOR APP ID
        sentence = str(row["Sentences"])
        # lines starting with "#" are comment rows, not sentences
        if not sentence.startswith("#"):
            if "Manually Marked" not in row:
                raise Exception("Manually Marked label does not exist")
            mark = row["Manually Marked"] == 1
            sentence_report = SentenceReport(sentence, mark)
            sentence_report.preprocessed_sentence = " ".join(
                NLPUtils.preprocess_sentence(sentence_report.sentence, stemmer))
            sentence_report.all_phrases = __find_all_possible_phrases(
                sentence_report.preprocessed_sentence, sentence_only=True)
            test_sentence_reports.append(sentence_report)
    print("Loading completed")
    return test_sentence_reports
def load_row_acnet(infile, gold_permission, stemmer, embeddings):
    print("Loading row {}".format(infile))
    # read training data
    print("Reading Train Sentences")
    tagged_train_file = pd.read_csv(infile)
    train_sentence_reports = []
    acnet_map = {
        "RECORD_AUDIO": "MICROPHONE",
        "READ_CONTACTS": "CONTACTS",
        "READ_CALENDAR": "CALENDAR",
        "ACCESS_FINE_LOCATION": "LOCATION",
        "CAMERA": "CAMERA",
        "READ_SMS": "SMS",
        "READ_CALL_LOGS": "CALL_LOG",
        "CALL_PHONE": "PHONE",
        "WRITE_SETTINGS": "SETTINGS",
        "GET_TASKS": "TASKS",
    }
    for idx, row in tagged_train_file.iterrows():
        app_id = row["app_id"]
        sentence = row["sentence"]
        mark = row[acnet_map[gold_permission]]
        sentence_report = SentenceReport(app_id, sentence, mark)
        preprocessed = NLPUtils.preprocess_sentence(sentence_report.sentence,
                                                    stemmer)
        # keep only tokens that have an embedding vector
        sentence_report.preprocessed_sentence = [
            word for word in preprocessed if word in embeddings
        ]
        train_sentence_reports.append(sentence_report)
    print("Loading completed")
    return train_sentence_reports
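# Hedged usage sketch, not part of the original pipeline: how load_row_acnet
# might be driven. The CSV path, permission name, and stemmer flag are
# illustrative assumptions; `embeddings` only needs to support
# `word in embeddings`, so a plain dict of word vectors works as well as a
# gensim KeyedVectors instance.
def _demo_load_acnet():
    toy_embeddings = {"record": [0.1, 0.2], "audio": [0.3, 0.4]}  # toy lookup
    reports = load_row_acnet("acnet_train.csv", "RECORD_AUDIO", "porter",
                             toy_embeddings)
    print("loaded {} sentence reports".format(len(reports)))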
def predict_descriptions(lst):
    for item in lst:
        # default to -1 so items without a usable first sentence (missing
        # description or empty tokenization) still carry a prediction; the
        # original only set -1 when sentence tokenization came back empty
        item["prediction"] = -1
        description = item["description"]
        if not pd.isna(description):
            sentences = nltk.sent_tokenize(description)
            if len(sentences) > 0:
                # classify only the first sentence of the description
                sentence = sentences[0]
                sent = NLPUtils.preprocess_sentence(sentence, args.stemmer)
                item["prediction"] = predict_raw_sentence(model, sent)
def calculate_freqs(infile, stemmer, embeddings):
    # NOTE: the embeddings argument is kept for interface symmetry with the
    # loaders above but is not used when counting token frequencies
    tagged_train_file = pd.read_csv(infile)
    vocab_freq = {}
    for idx, row in tagged_train_file.iterrows():
        sentence = row["sentence"]
        preprocessed = NLPUtils.preprocess_sentence(sentence, stemmer)
        for token in preprocessed:
            vocab_freq[token] = vocab_freq.get(token, 0) + 1
    return vocab_freq
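# Hedged usage sketch: inspecting the most frequent preprocessed tokens
# produced by calculate_freqs. Since the embeddings parameter is unused by
# calculate_freqs, an empty dict suffices here; the top-20 cutoff is an
# illustrative choice.
def _demo_top_tokens(infile, stemmer, n=20):
    freqs = calculate_freqs(infile, stemmer, {})
    ranked = sorted(freqs.items(), key=lambda kv: kv[1], reverse=True)
    for token, count in ranked[:n]:
        print("{}\t{}".format(token, count))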
def predict_class_and_method_signatures(lst):
    for item in lst:
        all_tokens = []
        class_name = item["class"]
        if not pd.isna(class_name):
            tokens = process_class_name(class_name)
            all_tokens.extend(tokens)
        if "method" in item:
            method_name = item["method"]
            if not pd.isna(method_name):
                tokens = process_method_name(method_name)
                all_tokens.extend(tokens)
        # build a pseudo-sentence from the class/method name tokens and
        # classify it the same way as a description sentence
        signature = " ".join(all_tokens)
        sent = NLPUtils.preprocess_sentence(signature, args.stemmer)
        item["prediction"] = predict_raw_sentence(model, sent)
def load_row_document_acnet_file(infile, stemmer, embeddings, filtered_words):
    print("Loading row {}".format(infile))
    # read training data
    print("Reading Train Sentences")
    tagged_train_file = pd.read_csv(infile)
    documents = []
    acnet_map = {
        "RECORD_AUDIO": "MICROPHONE",
        "READ_CONTACTS": "CONTACTS",
        "READ_CALENDAR": "CALENDAR",
        "ACCESS_FINE_LOCATION": "LOCATION",
        "CAMERA": "CAMERA",
        "READ_SMS": "SMS",
        "READ_CALL_LOGS": "CALL_LOG",
        "CALL_PHONE": "PHONE",
        "WRITE_SETTINGS": "SETTINGS",
        "GET_TASKS": "TASKS",
        "STORAGE": "STORAGE",
    }
    for idx, row in tagged_train_file.iterrows():
        app_id = row["app_id"]
        sentence = row["sentence"]
        # start a new document on the first row and whenever app_id changes
        if not documents or documents[-1].app_id != app_id:
            documents.append(DocumentReport(app_id))
        # a document is positive for a permission if any of its rows is
        for permission in acnet_map:
            if (permission not in documents[-1].permissions
                    or row[acnet_map[permission]] == 1):
                documents[-1].permissions[permission] = row[acnet_map[permission]]
        documents[-1].sentences.append(sentence)
        preprocessed = NLPUtils.preprocess_sentence(sentence, stemmer)
        # keep only tokens that are embedded and pass the word filter
        filtered = [
            word for word in preprocessed
            if word in embeddings and word in filtered_words
        ]
        documents[-1].preprocessed_sentences.append(filtered)
    print("Loading completed")
    return documents
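# Hedged usage sketch: counting how many loaded documents carry a positive
# gold label per permission. DocumentReport.permissions maps each Android
# permission name to the 0/1 label collected in the loader above.
def _demo_permission_counts(documents):
    counts = {}
    for doc in documents:
        for permission, label in doc.permissions.items():
            counts[permission] = counts.get(permission, 0) + int(label == 1)
    return counts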
def load_row_reviews(infile, stemmer, embeddings):
    print("Loading row {}".format(infile))
    reviews = {}
    tagged_train_file = pd.read_csv(infile)
    for idx, row in tagged_train_file.iterrows():
        # progress indicator for large review files
        if idx != 0 and idx % 1000 == 0:
            print(idx)
        app_id, sentence, score = (
            row["application_id"],
            row["review_sentence"],
            row["score"],
        )
        # the original truthiness check missed missing values, since NaN is
        # truthy in Python; use pd.notna to skip incomplete rows instead
        if pd.notna(app_id) and pd.notna(sentence) and pd.notna(score):
            preprocessed = NLPUtils.preprocess_sentence(sentence, stemmer)
            if len(preprocessed) != 0:
                review = Review(sentence, score)
                if app_id not in reviews:
                    reviews[app_id] = []
                # keep only tokens that have an embedding vector
                review.preprocessed_sentence = [
                    word for word in preprocessed if word in embeddings
                ]
                reviews[app_id].append(review)
    return reviews
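# Hedged usage sketch: keeping only apps with enough review coverage before
# downstream processing. The minimum-review threshold is an illustrative
# assumption, not a value taken from the original code.
def _demo_filter_reviews(reviews, min_reviews=5):
    return {app_id: revs for app_id, revs in reviews.items()
            if len(revs) >= min_reviews}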
for method in methods:
    description = method.description
    sentences = nltk.sent_tokenize(description)
    if len(sentences) > 0:
        # classify the first sentence and, if available, its verb phrase
        sentence = sentences[0]
        try:
            tree = parser.parse(sentence)
        except ValueError:
            continue
        data = {}
        data["description"] = {"str": sentence, "prediction": -1}
        data["vp"] = {"str": extract_vp(tree), "prediction": -1}
        sent = NLPUtils.preprocess_sentence(data["description"]["str"],
                                            args.stemmer)
        data["description"]["prediction"] = predict_raw_sentence(model, sent)
        if data["vp"]["str"] != "-":
            sent = NLPUtils.preprocess_sentence(data["vp"]["str"], args.stemmer)
            data["vp"]["prediction"] = predict_raw_sentence(model, sent)
        d = Description(data)
        method.descriptions.append(d)
        try:
            session.add(d)
            session.commit()
        except Exception as exc:
            # the bare `except:` also swallowed KeyboardInterrupt/SystemExit;
            # catch Exception, report it, and roll back the failed transaction
            print("Database Error: {}".format(exc))
            session.rollback()