Example #1
def even_distribution(data, bin_count):
    """Create even distribution by removing data from bins with higher datapoint count than the smallest bin"""

    if data is None:
        return []

    min_timespent = min(data,
                        key=lambda datapoint: datapoint[TIMESPENT_FIELD_KEY]
                        )[TIMESPENT_FIELD_KEY]
    max_timespent = max(data,
                        key=lambda datapoint: datapoint[TIMESPENT_FIELD_KEY]
                        )[TIMESPENT_FIELD_KEY]
    timespent_range = max_timespent - min_timespent

    bins, bin_volumes = get_bins_and_volumes(data, bin_count, timespent_range)
    min_bin_volume = min(bin_volumes)
    print("Bin volumes:", *bin_volumes)

    evenly_distributed_data = []
    for i, b in enumerate(bins):
        # Downsample each bin to roughly min_bin_volume datapoints: a datapoint
        # is skipped whenever scaling its index by the factor does not advance
        # the rounded position, so about factor * len(bin) datapoints survive.
        factor = min_bin_volume / bin_volumes[i]
        for j, d in enumerate(b):
            if round(j * factor) == round((j + 1) * factor):
                continue
            evenly_distributed_data.append(d)

    print(
        "%d (%.2f%%) of %d records were selected and an even distribution was created"
        % get_part_strings(len(evenly_distributed_data), len(data)))

    return evenly_distributed_data
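get_bins_and_volumes is defined elsewhere in the repository and is not shown in this listing. A minimal sketch of what it plausibly does, assuming bin_count equal-width bins over the time-spent range; the helper below is hypothetical, not the repository's actual implementation:

def get_bins_and_volumes(data, bin_count, timespent_range):
    # Hypothetical helper: split datapoints into bin_count equal-width bins
    # over the observed time-spent range and report each bin's size.
    min_timespent = min(d[TIMESPENT_FIELD_KEY] for d in data)
    bin_width = (timespent_range / bin_count) or 1  # avoid zero width when all values are equal
    bins = [[] for _ in range(bin_count)]
    for datapoint in data:
        index = int((datapoint[TIMESPENT_FIELD_KEY] - min_timespent) // bin_width)
        bins[min(index, bin_count - 1)].append(datapoint)  # clamp the maximum value
    bin_volumes = [len(b) for b in bins]
    return bins, bin_volumes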
Example #2
def bing_search(query, bing_api_key):
    """Returns a list of URLs obtained in a Bing search query using Bing Web Search API v7
    https://docs.microsoft.com/en-gb/rest/api/cognitiveservices/bing-web-api-v7-reference
    
    Arguments:

    query -- Bing search query, advanced operators may be used,
    see https://msdn.microsoft.com/library/ff795620.aspx

    bing_api_key -- Bing Web Search API key which can be obtained
    at https://azure.microsoft.com/en-us/services/cognitive-services/bing-web-search-api/
    """

    PAGE_SIZE = 50
    results = set()
    totalEstimatedMatches = 0
    page = 0
    
    while page * PAGE_SIZE <= totalEstimatedMatches:

        payload = {'q': query, 'count': PAGE_SIZE, 'offset': page * PAGE_SIZE, 'responseFilter': 'Webpages'}
        headers = {'Ocp-Apim-Subscription-Key': bing_api_key}

        page = page + 1

        try:
            response = requests.get(BING_SEARCH_URL, params=payload, headers=headers)
        except requests.exceptions.RequestException:
            print("Request exception, jump over page")
            continue

        if response.status_code != 200:
            print("Unsuccessful status code %d, jump over page" % response.status_code)
            continue

        try:
            json_response = response.json()
        except json.JSONDecodeError:
            print("Could not decode response or it didn't contain an URL, jump over page")
            continue

        if (json_response is None or json_response.get("webPages") is None
                or json_response.get("webPages").get("value") is None):
            print("Couldn't get web pages from response")
            continue

        webpages = json_response.get("webPages").get("value")
        result = {webpage.get("url") for webpage in webpages if webpage.get("url") is not None}
        
        results = results.union(result)

        totalEstimatedMatches = json_response.get("webPages").get("totalEstimatedMatches", 0)
        totalPages = math.ceil(totalEstimatedMatches / PAGE_SIZE)

        print("%d (%.2f%%) of %d result pages processed" % get_part_strings(page, totalPages))
    
    print("%d potential JIRA instances found" % len(results))
    return results
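requests, json, math, BING_SEARCH_URL and get_part_strings come from the rest of the module. A minimal usage sketch, assuming the imports and the documented v7 endpoint below; the query string and API key are placeholder illustrations only:

import json
import math

import requests

# Assumed module-level constant: the Bing Web Search API v7 endpoint.
BING_SEARCH_URL = "https://api.cognitive.microsoft.com/bing/v7.0/search"

urls = bing_search("jira issue tracker", "YOUR_BING_API_KEY")
print(len(urls), "URLs collected")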
Example #3
def remove_unlabeled_datapoints(data):
    """Keep only datapoints that carry a time-spent label"""

    labeled_data = [
        datapoint for datapoint in data if TIMESPENT_FIELD_KEY in datapoint
    ]
    if len(labeled_data) != len(data):
        print(
            "%d (%.2f%%) of %d datapoints were removed because they were unlabeled"
            % get_part_strings(len(data) - len(labeled_data), len(data)))
    return labeled_data
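get_part_strings drives the progress messages in every example but is not included in this listing; a plausible minimal sketch, assuming it returns a (part, percentage, total) tuple ready for %-formatting:

def get_part_strings(part, total):
    # Hypothetical helper: values for messages like "%d (%.2f%%) of %d ...".
    percentage = part / total * 100 if total else 0.0
    return part, percentage, total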
Example #4
def google_search(keyword, google_api_key, cse_id):
    """Returns a list of URLs obtained in a Google search query using Google Custom Search Engine API
    https://developers.google.com/custom-search/json-api/v1/reference/cse/list
    The Custom Search Engine instance should be configured to search the whole Web
    as described in the first two steps at https://stackoverflow.com/a/37084643

    Arguments:

    keyword -- Google search query, advanced operators may be used,
    see https://bynd.com/news-ideas/google-advanced-search-comprehensive-list-google-search-operators/

    google_api_key -- Google API key,
    see https://developers.google.com/api-client-library/python/guide/aaa_apikeys

    cse_id -- Custom Search Engine ID, see https://cse.google.com/cse/
    """

    PAGE_SIZE = 10

    service = build("customsearch", "v1", developerKey=google_api_key)
    results = set()
    total_results = 0
    page = 0

    while page * PAGE_SIZE <= total_results:

        # Pause every fifth page to throttle the request rate.
        if page % 5 == 0:
            time.sleep(5)

        try:
            res = service.cse().list(q=keyword, cx=cse_id, num=PAGE_SIZE, start=page * PAGE_SIZE + 1).execute()
            page = page + 1
            total_results = int(res.get("searchInformation").get("totalResults"))
        except Exception as e:
            print("Exception, skip page")
            print(e)
            break

        items = res.get("items")
        if not items:
            continue

        for item in items:
            url = item.get("link")
            if url is not None:
                results.add(url)

        totalPages = math.ceil(total_results / PAGE_SIZE)
        print("%d (%.2f%%) of %d result pages processed" % get_part_strings(page, totalPages))

    print("%d potential JIRA instances found" % len(results))
    return results
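build comes from the google-api-python-client package (googleapiclient.discovery). A minimal usage sketch, assuming the imports below; the query, API key and CSE ID are placeholders:

import math
import time

from googleapiclient.discovery import build

urls = google_search("jira issue tracker", "YOUR_GOOGLE_API_KEY", "YOUR_CSE_ID")
print(len(urls), "URLs collected")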
Example #5
def escape_short_texts(data, minimum_words):
    """Remove task with description length shorter than minimum_words"""

    filtered_data = [
        datapoint for datapoint in data
        if word_count(datapoint.get(SUMMARY_FIELD_KEY, "")) +
        word_count(datapoint.get(DESCRIPTION_FIELD_KEY, "")) >= minimum_words
    ]
    print("%d (%.2f%%) of %d records were selected" %
          get_part_strings(len(filtered_data), len(data)))
    return filtered_data
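word_count is another shared helper that does not appear in this listing; a minimal sketch, assuming a plain whitespace split is sufficient for the filter above:

def word_count(text):
    # Hypothetical helper: number of whitespace-separated tokens in a field.
    return len(text.split())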
Example #6
def remove_small_projects(data, minimum_project_size):
    """Return the set of projects that contain at least minimum_project_size issues"""

    issue_counts = get_issue_counts(data)
    selected_projects = {
        issue_count[0]
        for issue_count in issue_counts
        if issue_count[1] >= minimum_project_size
    }
    print("%d (%.2f%%) of %d projects were selected" %
          get_part_strings(len(selected_projects), len(get_projects(data))))

    return selected_projects
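get_issue_counts and get_projects are shared helpers not shown in this listing. A plausible sketch, assuming PROJECT_FIELD_KEY names each datapoint's project key and that (project, count) pairs are wanted largest-first, which matches how issue_count[0] and issue_count[1] are indexed above:

from collections import Counter


def get_projects(data):
    # Hypothetical helper: distinct project keys present in the dataset.
    return {datapoint.get(PROJECT_FIELD_KEY) for datapoint in data}


def get_issue_counts(data):
    # Hypothetical helper: (project, issue_count) pairs, largest projects first.
    counts = Counter(datapoint.get(PROJECT_FIELD_KEY) for datapoint in data)
    return counts.most_common()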
Example #7
def filter_data_by_projects(data, selected_projects):
    """Keep only datapoints that belong to one of the selected projects"""

    if not selected_projects:
        return

    selected_data = [
        datapoint for datapoint in data
        if is_in_projects(datapoint, selected_projects)
    ]
    print("%d (%.2f%%) of %d datapoints selected" %
          get_part_strings(len(selected_data), len(data)))

    return selected_data
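is_in_projects is also defined elsewhere; a minimal sketch, assuming it simply checks the datapoint's project key against the selected set:

def is_in_projects(datapoint, selected_projects):
    # Hypothetical helper: True when the datapoint's project was selected.
    return datapoint.get(PROJECT_FIELD_KEY) in selected_projects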
Example #8
def remove_outliers(data, minimum_timespent_seconds,
                    maximum_timespent_seconds):
    """Drop datapoints whose time spent lies outside the given bounds"""

    print(
        "Filtering out datapoints with time spent lower than %d seconds and higher than %d seconds"
        % (minimum_timespent_seconds, maximum_timespent_seconds))
    filtered_data = [
        datapoint for datapoint in data
        if minimum_timespent_seconds <= datapoint[TIMESPENT_FIELD_KEY] <=
        maximum_timespent_seconds
    ]

    print(
        "%d (%.2f%%) of %d datapoints were selected for testing and training" %
        get_part_strings(len(filtered_data), len(data)))

    return filtered_data
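A short usage sketch; the 10-minute and 80-hour bounds below are arbitrary illustrative values, not thresholds taken from the repository:

cleaned = remove_outliers(data, 10 * 60, 80 * 3600)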
Example #9
def select_projects(data):
    """Interactively choose projects after listing how many issues in each are labeled"""

    print("Please select one or more of the following projects:")
    project_issue_counts = get_issue_counts([
        datapoint for datapoint in data
        if datapoint.get(TIMESPENT_FIELD_KEY, None) is not None
    ])

    for c in project_issue_counts:
        total_issue_count = sum(
            1 for datapoint in data if datapoint.get(PROJECT_FIELD_KEY) == c[0]
        )
        part_strings = get_part_strings(c[1], total_issue_count)
        print("%s - %d (%.2f%%) of %d issues are labeled" %
              (c[0], part_strings[0], part_strings[1], part_strings[2]))

    selected_projects = input("Selected datasets: ")
    selected_projects = selected_projects.replace(",", " ")
    selected_projects = re.sub(r"[^ A-Za-z0-9\-]", "", selected_projects)
    selected_projects = set(selected_projects.split())

    return selected_projects & get_projects(data)
Example #10
def spacy_lookup(dataset, notes_filename, token_counts=None, save=True):
    """Build a word-to-vector lookup table for the dataset's tokens using spaCy word vectors"""

    if token_counts is None:
        token_count_filename = get_dataset_filename(dataset, ALL_FILENAME,
                                                    TOKEN_COUNT_POSTFIX,
                                                    JSON_FILE_EXTENSION)
        token_counts = load_json(token_count_filename)

    nlp = spacy.load('en_vectors_web_lg')

    print("Creating lookup table...")
    no_vector_count = 0
    lookup = {}
    for word in token_counts:

        # Each entry starts with the token itself; skip tokens spaCy has no vector for.
        doc = nlp(word[0])
        if not doc[0].has_vector:
            no_vector_count += 1
            continue

        lookup[word[0]] = doc[0].vector.tolist()

    with open(notes_filename, "a") as notes_file:
        print("%d (%.0f%%) of %d dictionary words had Spacy vectors" %
              get_part_strings(len(lookup),
                               len(lookup) + no_vector_count),
              file=notes_file)

    if save:
        print("Saving...")
        lookup_filename = get_dataset_filename(dataset, ALL_FILENAME,
                                               SPACY_LOOKUP_POSTFIX,
                                               JSON_FILE_EXTENSION)
        save_json(lookup_filename, lookup)
        print("Lookup table saved at", lookup_filename)

    return lookup
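The *_FILENAME/*_POSTFIX constants, get_dataset_filename, load_json and save_json belong to the rest of the repository, and the en_vectors_web_lg model must be installed for spacy.load to succeed. A minimal usage sketch, assuming token_counts is a list of (token, count) pairs so the file lookup can be bypassed; the dataset and notes names are placeholders:

token_counts = [("database", 42), ("refactor", 17), ("qwzx", 1)]
lookup = spacy_lookup("my_dataset", "notes.txt", token_counts=token_counts, save=False)
print(len(lookup), "tokens received vectors")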