def even_distribution(data, bin_count):
    """Create an even distribution by removing datapoints from bins that contain
    more datapoints than the smallest bin"""
    if data is None:
        return []

    min_timespent = min(
        data, key=lambda datapoint: datapoint[TIMESPENT_FIELD_KEY])[TIMESPENT_FIELD_KEY]
    max_timespent = max(
        data, key=lambda datapoint: datapoint[TIMESPENT_FIELD_KEY])[TIMESPENT_FIELD_KEY]
    timespent_range = max_timespent - min_timespent

    bins, bin_volumes = get_bins_and_volumes(data, bin_count, timespent_range)
    min_bin_volume = min(bin_volumes)
    print("Bin volumes:", *bin_volumes)

    evenly_distributed_data = []
    for i, b in enumerate(bins):
        # Thin out each bin so it contributes roughly min_bin_volume datapoints
        factor = min_bin_volume / bin_volumes[i]
        for j, d in enumerate(b):
            if round(j * factor) == round((j + 1) * factor):
                continue
            evenly_distributed_data.append(d)

    print("%d (%.2f%%) of %d records were selected and an even distribution was created"
          % get_part_strings(len(evenly_distributed_data), len(data)))
    return evenly_distributed_data
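
# Neither get_bins_and_volumes nor get_part_strings is defined in this section; the
# sketches below are assumptions inferred from how they are used above (equal-width
# binning over the timespent range, and the "%d (%.2f%%) of %d" report strings).
# The project's real helpers may differ.

def get_part_strings(part, total):
    # Assumed to return a (part, percentage, total) tuple for the report strings
    return part, (part / total * 100) if total else 0.0, total


def get_bins_and_volumes(data, bin_count, timespent_range):
    # Assumed to split the timespent range into bin_count equal-width bins and
    # return the bins together with their datapoint counts
    min_timespent = min(d[TIMESPENT_FIELD_KEY] for d in data)
    bin_width = timespent_range / bin_count
    bins = [[] for _ in range(bin_count)]
    for datapoint in data:
        index = int((datapoint[TIMESPENT_FIELD_KEY] - min_timespent) / bin_width)
        bins[min(index, bin_count - 1)].append(datapoint)
    return bins, [len(b) for b in bins]
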
def bing_search(query, bing_api_key):
    """Returns a set of URLs obtained from a Bing search query using the Bing Web Search API v7
    https://docs.microsoft.com/en-gb/rest/api/cognitiveservices/bing-web-api-v7-reference

    Arguments:
    query -- Bing search query, advanced operators may be used, see
             https://msdn.microsoft.com/library/ff795620.aspx
    bing_api_key -- Bing Web Search API key which can be obtained at
                    https://azure.microsoft.com/en-us/services/cognitive-services/bing-web-search-api/
    """
    PAGE_SIZE = 50
    results = set()
    total_estimated_matches = 0
    page = 0

    while page * PAGE_SIZE <= total_estimated_matches:
        payload = {
            'q': query,
            'count': PAGE_SIZE,
            'offset': page * PAGE_SIZE,
            'responseFilter': 'Webpages',
        }
        headers = {'Ocp-Apim-Subscription-Key': bing_api_key}
        page += 1

        try:
            response = requests.get(BING_SEARCH_URL, params=payload, headers=headers)
        except requests.exceptions.RequestException:
            print("Request exception, jump over page")
            continue

        if response.status_code != 200:
            print("Unsuccessful status code %d, jump over page" % response.status_code)
            continue

        try:
            json_response = response.json()
        except json.JSONDecodeError:
            print("Could not decode response, jump over page")
            continue

        if json_response is None \
                or json_response.get("webPages") is None \
                or json_response.get("webPages").get("value") is None:
            print("Couldn't get web pages from response")
            continue

        webpages = json_response.get("webPages").get("value")
        result = {webpage.get("url") for webpage in webpages if webpage.get("url") is not None}
        results = results.union(result)

        total_estimated_matches = json_response.get("webPages").get("totalEstimatedMatches")
        total_pages = math.ceil(total_estimated_matches / PAGE_SIZE)
        print("%d (%.2f%%) of %d result pages processed" % get_part_strings(page, total_pages))

    print("%d potential JIRA instances found" % len(results))
    return results
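
# BING_SEARCH_URL is a module-level constant that is not shown in this section. The
# value below is an assumption based on the v7 endpoint documented at the link in the
# docstring above; the project may use a different constant.
BING_SEARCH_URL = "https://api.cognitive.microsoft.com/bing/v7.0/search"

# Hypothetical invocation with a placeholder query and key:
# urls = bing_search("<search query>", "<your-bing-api-key>")
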
def remove_unlabeled_datapoints(data):
    labeled_data = [
        datapoint for datapoint in data if TIMESPENT_FIELD_KEY in datapoint
    ]
    if len(labeled_data) != len(data):
        print("%d (%d%%) of %d datapoints were removed because they were unlabeled"
              % get_part_strings(len(data) - len(labeled_data), len(data)))
    return labeled_data
def google_search(keyword, google_api_key, cse_id):
    """Returns a set of URLs obtained from a Google search query using the Google Custom Search Engine API
    https://developers.google.com/custom-search/json-api/v1/reference/cse/list
    The Custom Search Engine instance should be configured to search the whole Web as described
    in the first two steps at https://stackoverflow.com/a/37084643

    Arguments:
    keyword -- Google search query, advanced operators may be used, see
               https://bynd.com/news-ideas/google-advanced-search-comprehensive-list-google-search-operators/
    google_api_key -- Google API key, see
                      https://developers.google.com/api-client-library/python/guide/aaa_apikeys
    cse_id -- Custom Search Engine ID, see https://cse.google.com/cse/
    """
    PAGE_SIZE = 10
    service = build("customsearch", "v1", developerKey=google_api_key)
    results = set()
    total_results = 0
    page = 0

    while page * PAGE_SIZE <= total_results:
        # Pause periodically to stay within the API's rate limits
        if page % 5 == 0:
            time.sleep(5)

        try:
            res = service.cse().list(q=keyword, cx=cse_id, num=PAGE_SIZE,
                                     start=page * PAGE_SIZE + 1).execute()
            page += 1
            total_results = int(res.get("searchInformation").get("totalResults"))
        except Exception as e:
            print("Exception, skip page")
            print(e)
            break

        items = res.get("items")
        if items is None:
            continue

        for item in items:
            url = item.get("link")
            if url is not None:
                results.add(url)

    total_pages = math.ceil(total_results / PAGE_SIZE)
    print("%d (%.2f%%) of %d result pages processed" % get_part_strings(page, total_pages))
    print("%d potential JIRA instances found" % len(results))
    return results
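
# google_search() relies on build() from the Google API client library; the import
# below is an assumption about where it comes from (pip install google-api-python-client),
# since the module's import section is not shown here.
from googleapiclient.discovery import build

# Hypothetical invocation with placeholder credentials:
# urls = google_search("<search query>", "<google-api-key>", "<cse-id>")
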
def escape_short_texts(data, minimum_words):
    """Remove tasks whose summary and description together contain fewer than minimum_words words"""
    filtered_data = [
        datapoint for datapoint in data
        if word_count(datapoint.get(SUMMARY_FIELD_KEY, "")) +
        word_count(datapoint.get(DESCRIPTION_FIELD_KEY, "")) >= minimum_words
    ]
    print("%d (%.2f%%) of %d records were selected"
          % get_part_strings(len(filtered_data), len(data)))
    return filtered_data
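
# word_count() is not defined in this section. A minimal sketch consistent with how it
# is used above; the project's real implementation may tokenize differently.
def word_count(text):
    # Number of whitespace-separated tokens in the text
    return len(text.split())
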
def remove_small_projects(data, minimum_project_size):
    """Return the set of projects that contain at least minimum_project_size issues"""
    issue_counts = get_issue_counts(data)
    selected_projects = {
        issue_count[0]
        for issue_count in issue_counts if issue_count[1] >= minimum_project_size
    }
    print("%d (%.2f%%) of %d projects were selected"
          % get_part_strings(len(selected_projects), len(get_projects(data))))
    return selected_projects
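
# get_issue_counts() and get_projects() are not defined in this section. The sketches
# below are assumptions inferred from how they are used here and in select_projects():
# (project, issue count) pairs grouped by PROJECT_FIELD_KEY, and the set of project keys.
from collections import Counter


def get_projects(data):
    # Assumed to return the set of project keys present in the data
    return {datapoint.get(PROJECT_FIELD_KEY) for datapoint in data}


def get_issue_counts(data):
    # Assumed to return (project, issue count) pairs, most common projects first
    return Counter(datapoint.get(PROJECT_FIELD_KEY) for datapoint in data).most_common()
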
def filter_data_by_projects(data, selected_projects):
    if len(selected_projects) == 0:
        return

    selected_data = [
        datapoint for datapoint in data
        if is_in_projects(datapoint, selected_projects)
    ]
    print("%d (%.2f%%) of %d datapoints selected"
          % get_part_strings(len(selected_data), len(data)))
    return selected_data
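
# is_in_projects() is not defined in this section; a one-line sketch of the assumed
# membership test on the datapoint's project key.
def is_in_projects(datapoint, selected_projects):
    return datapoint.get(PROJECT_FIELD_KEY) in selected_projects
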
def remove_outliers(data, minimum_timespent_seconds, maximum_timespent_seconds):
    print("Filtering out datapoints with time spent lower than %d seconds and higher than %d seconds"
          % (minimum_timespent_seconds, maximum_timespent_seconds))
    filtered_data = [
        datapoint for datapoint in data
        if minimum_timespent_seconds <= datapoint[TIMESPENT_FIELD_KEY] <= maximum_timespent_seconds
    ]
    print("%d (%.2f%%) of %d datapoints were selected for testing and training"
          % get_part_strings(len(filtered_data), len(data)))
    return filtered_data
def select_projects(data):
    print("Please select one or more of the following projects:")
    project_issue_counts = get_issue_counts([
        datapoint for datapoint in data
        if datapoint.get(TIMESPENT_FIELD_KEY, None) is not None
    ])
    for c in project_issue_counts:
        total_issue_count = sum([
            1 for datapoint in data if datapoint.get(PROJECT_FIELD_KEY) == c[0]
        ])
        part_strings = get_part_strings(c[1], total_issue_count)
        print("%s - %d (%.2f%%) of %d issues are labeled"
              % (c[0], part_strings[0], part_strings[1], part_strings[2]))

    selected_projects = input("Selected datasets: ")
    # Accept comma- or space-separated project keys and strip any other characters
    selected_projects = selected_projects.replace(",", " ")
    selected_projects = re.sub(r"[^ A-Za-z0-9\-]", "", selected_projects)
    selected_projects = set(selected_projects.split())
    return selected_projects & get_projects(data)
def spacy_lookup(dataset, notes_filename, token_counts=None, save=True):
    if token_counts is None:
        token_count_filename = get_dataset_filename(
            dataset, ALL_FILENAME, TOKEN_COUNT_POSTFIX, JSON_FILE_EXTENSION)
        token_counts = load_json(token_count_filename)

    nlp = spacy.load('en_vectors_web_lg')

    print("Creating lookup table...")
    no_vector_count = 0
    lookup = {}
    for word in token_counts:
        doc = nlp(word[0])
        if not doc[0].has_vector:
            no_vector_count += 1
            continue
        lookup[word[0]] = doc[0].vector.tolist()

    with open(notes_filename, "a") as notes_file:
        print("%d (%.0f%%) of %d dictionary words had Spacy vectors"
              % get_part_strings(len(lookup), len(lookup) + no_vector_count),
              file=notes_file)

    if save:
        print("Saving...")
        lookup_filename = get_dataset_filename(
            dataset, ALL_FILENAME, SPACY_LOOKUP_POSTFIX, JSON_FILE_EXTENSION)
        save_json(lookup_filename, lookup)
        print("Lookup table saved at", lookup_filename)

    return lookup
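
# spacy_lookup() loads the vectors-only spaCy model en_vectors_web_lg, which for
# spaCy 2.x could be installed with "python -m spacy download en_vectors_web_lg".
# Hypothetical invocation, assuming a dataset name and notes file:
#
#   lookup = spacy_lookup("<dataset>", "<notes-file>.txt", save=False)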