def get_or_update_user(user, buffer_db=user_buffer):
        # fix encoding for name and e-mail address
        if user["name"] is not None:
            name = unicode(user["name"]).encode("utf-8")
        else:
            name = unicode(user["username"]).encode("utf-8")
        mail = unicode(user["email"]).encode("utf-8")
        # construct string for ID service and send query
        user_string = get_user_string(name, mail)

        # check buffer to reduce amount of DB queries
        if user_string in buffer_db:
            log.devinfo("Returning user '{}' from buffer.".format(user_string))
            return buffer_db[user_string]

        # get person information from ID service
        log.devinfo("Passing user '{}' to ID service.".format(user_string))
        idx = idservice.getPersonID(user_string)

        # update user data with person information from DB
        person = idservice.getPersonFromDB(idx)
        user["email"] = person["email1"]  # column "email1"
        user["name"] = person["name"]  # column "name"
        user["id"] = person["id"]  # column "id"

        # add user information to buffer
        # user_string = get_user_string(user["name"], user["email"]) # update for
        buffer_db[user_string] = user

        return user
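A minimal usage sketch, assuming a raw user dict shaped like the accesses above; the values are hypothetical, and idservice, user_buffer and get_user_string come from the enclosing scope:

raw_user = {"name": None, "username": "jdoe", "email": "jdoe@example.org"}  # hypothetical input
resolved = get_or_update_user(raw_user)
# resolved["id"], resolved["name"] and resolved["email"] now hold the ID-service data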
Example #3
def __parse_execute(artifact, schema, index, include_filepath):
    """ Execute the search for the given commit

    :param artifact: the (file name, artifact) tuple to search for
    :param schema: the search schema to use
    :param index: the search index to use
    :param include_filepath: indicator whether to take the 'file name' part of the artifact into account
    :return: a match list of tuples (file name, artifact, message ID)
    """

    log.devinfo("Searching for artifact ({}, {})...".format(
        artifact[0], artifact[1]))

    result = []

    with index.searcher() as searcher:
        # initialize query parser
        query_parser = QueryParser("content", schema=schema)

        # construct query
        if include_filepath:
            my_query = query_parser.parse(artifact[0] + " AND " + artifact[1])
        else:
            my_query = query_parser.parse(artifact[1])

        # search!
        query_result = searcher.search(my_query, terms=True)

        # construct result from query answer
        for r in query_result:
            result_tuple = (artifact[0], artifact[1], r["messageID"])
            result.append(result_tuple)

    return result
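A hedged usage sketch, assuming an already opened Whoosh index whose schema provides the 'content' and 'messageID' fields used above; the index path and artifact tuple are hypothetical:

from whoosh import index

ix = index.open_dir("results/mbox-index/dev-list.mbox")  # hypothetical index location
matches = __parse_execute(("Makefile", "build"), ix.schema, ix, include_filepath=True)
# matches is a list of (file name, artifact, message ID) tuples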
Example #4
def __mbox_getbody(message):
    """ Gett plain-text e-mail body for better searching.

    :param message: the mbox message to process
    :return: the unicode-encoded message body
    """

    __text_indicator = "text/"

    body = None
    if message.is_multipart():
        for part in message.walk():
            if part.is_multipart():
                for subpart in part.walk():
                    if __text_indicator in subpart.get_content_type():
                        body = subpart.get_payload(decode=True)
            elif __text_indicator in part.get_content_type():
                body = part.get_payload(decode=True)
    elif __text_indicator in message.get_content_type():
        body = message.get_payload(decode=True)

    if body is None:
        log.devinfo(message.get_content_type())
        log.devinfo(
            "An image or some other content has been found that cannot be indexed. Message is given an empty body."
        )
        body = ' '

    return unicode(body, errors="replace")
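A short sketch of extracting bodies from a local mbox file (the path is hypothetical):

import mailbox

mbox = mailbox.mbox("archives/dev-list.mbox")  # hypothetical mbox file
for message in mbox:
    body = __mbox_getbody(message)  # always unicode; ' ' for non-text content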
def load_xml(source_folder):
    """Load issues from disk.

    :param source_folder: the folder where to find .xml-files
    :return: the loaded issue data
    """

    filelist = [
        f for f in os.listdir(source_folder)
        if os.path.isfile(os.path.join(source_folder, f))
    ]
    issue_data = list()
    for file in filelist:
        srcfile = os.path.join(source_folder, file)
        log.devinfo("Loading issues from file '{}'...".format(srcfile))

        # check if file exists and exit early if not
        if not os.path.exists(srcfile):
            log.info("Issue file '{}' does not exist! Exiting early...".format(
                srcfile))
            sys.exit(-1)

        # with open(srcfile, 'r') as issues_file:
        xmldoc = parse(srcfile)
        issue_data.append(xmldoc)

    return issue_data
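A minimal usage sketch, assuming a folder containing the exported .xml files (the folder name is hypothetical):

issue_data = load_xml("jira-xml-export")  # hypothetical source folder
log.devinfo("Loaded {} issue documents.".format(len(issue_data)))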
Example #6
    def get_id_and_update_user(user, buffer_db_ids=user_id_buffer):
        # fix encoding for name and e-mail address
        if user["name"] is not None and user["name"] != "":
            name = unicode(user["name"]).encode("utf-8")
        else:
            name = unicode(user["username"]).encode("utf-8")
        mail = unicode(user["email"]).encode("utf-8")  # empty
        # construct string for ID service and send query
        user_string = get_user_string(name, mail)

        # check buffer to reduce amount of DB queries
        if user_string in buffer_db_ids:
            log.devinfo(
                "Returning person id for user '{}' from buffer.".format(
                    user_string))
            return buffer_db_ids[user_string]

        # get person information from ID service
        log.devinfo("Passing user '{}' to ID service.".format(user_string))
        idx = idservice.getPersonID(user_string)

        # add user information to buffer
        # user_string = get_user_string(user["name"], user["email"]) # update for
        buffer_db_ids[user_string] = idx

        return idx
Example #7
def reformat_issues(issue_data):
    """
    Re-arrange issue data structure.

    :param issue_data: the issue data to re-arrange
    :return: the re-arranged issue data
    """

    log.devinfo("Re-arranging Github issues...")

    # re-process all issues
    for issue in issue_data:

        # empty container for issue types
        issue["type"] = []

        # empty container for issue resolutions
        issue["resolution"] = []

        # if an issue has no eventsList, an empty List gets created
        if issue["eventsList"] is None:
            issue["eventsList"] = []

        # if an issue has no commentsList, an empty List gets created
        if issue["commentsList"] is None:
            issue["commentsList"] = []

        # if an issue has no relatedCommits, an empty List gets created
        if issue["relatedCommits"] is None:
            issue["relatedCommits"] = []

        # if an issue has no reviewsList, an empty list gets created
        if issue["reviewsList"] is None:
            issue["reviewsList"] = []

        # if an issue has no relatedIssues, an empty List gets created
        if "relatedIssues" not in issue:
            issue["relatedIssues"] = []

        # add "closed_at" information if not present yet
        if issue["closed_at"] is None:
            issue["closed_at"] = ""

        # parse the creation time into the correct format
        issue["created_at"] = format_time(issue["created_at"])

        # parse the close time into the correct format
        issue["closed_at"] = format_time(issue["closed_at"])

        # checks if the issue is a pull-request or a normal issue and adapts the type
        if issue["isPullRequest"]:
            issue["type"].append("pull request")
        else:
            issue["type"].append("issue")

    return issue_data
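A minimal before/after sketch for a single issue dict; only fields touched above are shown, the values are hypothetical, and the exact timestamp format depends on the format_time helper:

issue = {"eventsList": None, "commentsList": [], "relatedCommits": None,
         "reviewsList": None, "closed_at": None,
         "created_at": "2017-01-01T10:00:00Z", "isPullRequest": False}
reformat_issues([issue])
# issue["type"] == ["issue"], issue["eventsList"] == [], and the missing lists are filled in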
def reformat(issue_data):
    """Re-arrange issue data structure.

    :param issue_data: the issue data to re-arrange
    :return: the re-arranged issue data
    """

    log.devinfo("Re-arranging Github issues...")

    # re-process all issues
    for issue in issue_data:
        # temporary container for references
        comments = dict()

        # initialize event
        created_event = dict()
        created_event["user"] = issue["user"]
        created_event["created_at"] = issue["created_at"]
        created_event["event"] = "created"
        issue["eventsList"].append(created_event)

        # add event name to comment and add reference target
        for comment in issue["commentsList"]:
            comment["event"] = "commented"
            comment["ref_target"] = ""
            # cache comment by date to resolve/re-arrange references later
            comments[comment["created_at"]] = comment

        # add reference target to events
        for event in issue["eventsList"]:
            event["ref_target"] = ""
            # if event collides with a comment
            if event["created_at"] in comments:
                comment = comments[event["created_at"]]
                # if someone gets mentioned or subscribed by someone else in a comment,
                # re-write the reference
                if (event["event"] == "mentioned" or event["event"] == "subscribed") and \
                                comment["event"] == "commented":
                    event["ref_target"] = event["user"]
                    event["user"] = comment["user"]

        # merge events and comment lists
        issue["eventsList"] = issue["commentsList"] + issue["eventsList"]

        # add 'closed_at' information if not present yet
        if issue["closed_at"] is None:
            issue["closed_at"] = ""

        # remove events without user
        issue["eventsList"] = [
            event for event in issue["eventsList"]
            if not (event["user"] is None or event["ref_target"] is None)
        ]

    return issue_data
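A hedged sketch of the event/comment merging above: a "mentioned" event that collides with a comment timestamp gets re-attributed to the commenter (all names and timestamps are hypothetical):

issue = {"user": "alice", "created_at": "2017-01-02 09:00:00", "closed_at": None,
         "commentsList": [{"user": "bob", "created_at": "2017-01-03 12:00:00"}],
         "eventsList": [{"user": "carol", "created_at": "2017-01-03 12:00:00",
                         "event": "mentioned"}]}
reformat([issue])
# the "mentioned" event now has user "bob" and ref_target "carol",
# and issue["eventsList"] contains the comment, the event, and a "created" event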
Example #9
def load_csv(source_folder):
    """
    Load persons from disk.

    :param source_folder: the folder where to find .csv-file
    :return: the loaded person data contained in a dict consisting of two maps:
             keys are either name ("by_name") or username ("by_username"), values are name-email pairs
    """
    def find_first_existing(source_folder, filenames):
        """
        Check if any of the given file names exist in the given folder and return the first existing.

        :param source_folder: the folder where to search for the given file names
        :param filenames: the file names to search for
        :return: the first existing file name, None otherwise
        """

        filenames = map(lambda fi: os.path.join(source_folder, fi), filenames)
        existing = map(lambda fi: os.path.exists(fi), filenames)
        first = next((i for (i, x) in enumerate(existing) if x), None)

        if first is not None:
            return filenames[first]
        else:
            return None

    person_files = ("jira-comment-authors-with-email.csv",
                    "jira_issue_comments.csv")
    srcfile = find_first_existing(source_folder, person_files)

    # check if file exists and exit early if not
    if not srcfile:
        log.error("Person files '{}' do not exist! Exiting early...".format(
            person_files))
        sys.exit(-1)

    log.devinfo("Loading person csv from file '{}'...".format(srcfile))
    with open(srcfile, "r") as f:
        person_data = csv.DictReader(f, delimiter=",", skipinitialspace=True)
        persons_by_username = {}
        persons_by_name = {}
        for row in person_data:
            if not row["AuthorID"] in persons_by_username.keys():
                author_id_utf8 = unicode(row["AuthorID"]).encode("utf-8")
                persons_by_username[author_id_utf8] = (row["AuthorName"],
                                                       row["userEmail"])
            if not row["AuthorName"] in persons_by_name.keys():
                author_name_utf8 = unicode(row["AuthorName"]).encode("utf-8")
                persons_by_name[author_name_utf8] = (row["AuthorName"],
                                                     row["userEmail"])

        persons = dict()
        persons["by_username"] = persons_by_username
        persons["by_name"] = persons_by_name
    return persons
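A short usage sketch of the returned lookup maps (the source folder and username are hypothetical):

persons = load_csv("jira-csv-export")  # hypothetical source folder
name, email = persons["by_username"].get("jdoe", (None, None))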
Example #10
def parse(mbox_name, results, include_filepath):
    mbox = mailbox.mbox(mbox_name)

    my_schema = Schema(messageID=ID(stored=True), content=TEXT)
    index_path = results + "/index"
    # Create the index for Whoosh full-text search; if an index already exists, this step is skipped.
    if (not os.path.exists(index_path)) or (not exists_in(index_path)):
        log.devinfo("Creating Index in results folder for text search.")
        os.mkdir(index_path)
        create_in(index_path, my_schema)
        ix = open_dir(index_path)
        writer = ix.writer()
        for message in mbox:
            writer.add_document(messageID=unicode(message['message-id']),
                                content=getbody(message))
        writer.commit()
        log.devinfo("Index created, parsing will begin now.")
    else:
        log.devinfo(
            "Index has already been created, parsing will begin right away.")
        ix = open_dir(index_path)

    # Get the search terms from the commits.list file
    commit_list = open(results + "/commits.list", 'r')
    commits_separated = []
    commits = []
    commit_set = set()
    for line in commit_list:
        commits.append(line)
    for commit in commits:
        commit_separated = str.split(commit, ';')
        commit_set.add((commit_separated[9], commit_separated[10]))
        commits_separated.append(commit_separated)
    if include_filepath:
        my_file = open(results + "/mboxParsing_filepath.csv", 'w')
    else:
        my_file = open(results + "/mboxParsing.csv", 'w')

    # Parallel execution call for the text search.
    num_cores = multiprocessing.cpu_count()
    csv_data = Parallel(n_jobs=num_cores - 1)(
        delayed(execute)(commit, my_schema, ix, include_filepath)
        for commit in commit_set)

    # Write found hits to file.
    log.devinfo("Parsing done, writing to file commences.")
    wr = csv.writer(my_file, delimiter=';')
    wr.writerow(('file', 'artifact', 'message_id'))
    for entry in csv_data:
        for row in entry:
            wr.writerow(row)
    my_file.close()
    log.devinfo("Writing done and file closed.")
def reformat_issues(issue_data):
    """
    Re-arrange issue data structure.

    :param issue_data: the issue data to re-arrange
    :return: the re-arranged issue data
    """

    log.devinfo("Re-arranging Github issues...")

    # re-process all issues
    for issue in issue_data:

        # empty container for issue types
        issue["type"] = []

        # empty container for issue resolutions
        issue["resolution"] = []

        # if an issue has no eventsList, an empty List gets created
        if issue["eventsList"] is None:
            issue["eventsList"] = []

        # if an issue has no commentsList, an empty List gets created
        if issue["commentsList"] is None:
            issue["commentsList"] = []

        # if an issue has no relatedCommits, an empty List gets created
        if issue["relatedCommits"] is None:
            issue["relatedCommits"] = []

        # if an issue has no relatedIssues, an empty List gets created
        if "relatedIssues" not in issue:
            issue["relatedIssues"] = []

        # add "closed_at" information if not present yet
        if issue["closed_at"] is None:
            issue["closed_at"] = ""

        # parse the creation time into the correct format
        issue["created_at"] = format_time(issue["created_at"])

        # parse the close time into the correct format
        issue["closed_at"] = format_time(issue["closed_at"])

        # checks if the issue is a pull-request or a normal issue and adapts the type
        if issue["isPullRequest"]:
            issue["type"].append("pull request")
        else:
            issue["type"].append("issue")

    return issue_data
Example #12
def execute(commit, my_schema, ix, include_filepath):
    result = []
    with ix.searcher() as searcher:
        query_parser = QueryParser("content", schema=my_schema)
        if include_filepath:
            my_query = query_parser.parse(commit[0] + " AND " + commit[1])
        else:
            my_query = query_parser.parse(commit[1])
        query_result = searcher.search(my_query, terms=True)
        for r in query_result:
            result_tuple = (commit[0][1:-1], commit[1][1:-1], r["messageID"])
            result.append(result_tuple)
        log.devinfo("Artifact " + commit[0][1:-1] + ", " + commit[1][1:-1] +
                    " done!")
        return result
def load_csv(source_folder):
    """Load persons from disk.

    :param source_folder: the folder where to find .csv-file
    :return: the loaded person data
    """
    def find_first_existing(source_folder, filenames):
        """
        Check if any of the given file names exist in the given folder and return the first existing.

        :param source_folder: the folder where to search for the given file names
        :param filenames: the file names to search for
        :return: the first existing file name, None otherwise
        """

        filenames = map(lambda fi: os.path.join(source_folder, fi), filenames)
        existing = map(lambda fi: os.path.exists(fi), filenames)
        first = next((i for (i, x) in enumerate(existing) if x), None)

        if first is not None:
            return filenames[first]
        else:
            return None

    person_files = ("jira-comment-authors-with-email.csv",
                    "jira_issue_comments.csv")
    srcfile = find_first_existing(source_folder, person_files)

    # check if file exists and exit early if not
    if not srcfile:
        log.error("Person files '{}' do not exist! Exiting early...".format(
            person_files))
        sys.exit(-1)

    log.devinfo("Loading person csv from file '{}'...".format(srcfile))
    with open(srcfile, 'r') as f:
        person_data = csv.DictReader(f, delimiter=',', skipinitialspace=True)
        persons = {}
        for row in person_data:
            if not row['AuthorID'] in persons.keys():
                persons[row['AuthorID']] = (row['AuthorName'],
                                            row['userEmail'])

    return persons
def load_xml(source_folder, xml_file):
    """
    Load issues from disk.

    :param source_folder: the folder the .xml-file is in
    :param xml_file: the given xml-file
    :return: the loaded issue data
    """

    srcfile = os.path.join(source_folder, xml_file)
    log.devinfo("Loading issues from file '{}'...".format(srcfile))

    try:
        # parse the xml-file
        issue_data = parse(srcfile)
        return issue_data
    except Exception as e:
        log.info("Issue file " + format(srcfile) + " couldn't be opened because of a " + e.__class__.__name__)
        return None
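A short sketch of the intended call pattern, guarding against the None returned on parse errors (folder and file name are hypothetical):

xmldoc = load_xml("jira-xml-export", "PROJECT-issues.xml")
if xmldoc is None:
    log.devinfo("Skipping unreadable issue file.")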
def load(source_folder):
    """Load issues from disk.

    :param source_folder: the folder where to find 'issues.json'
    :return: the loaded issue data
    """

    srcfile = os.path.join(source_folder, "issues.json")
    log.devinfo("Loading Github issues from file '{}'...".format(srcfile))

    # check if file exists and exit early if not
    if not os.path.exists(srcfile):
        log.error("Github issue file '{}' does not exist! Exiting early...".format(srcfile))
        sys.exit(-1)

    with open(srcfile) as issues_file:
        issue_data = json.load(issues_file)

    return issue_data
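A hedged sketch of the typical load-then-reformat flow for the GitHub issue data (the folder name is hypothetical):

issue_data = load("github-data")  # hypothetical folder containing issues.json
issue_data = reformat_issues(issue_data)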
Example #16
    def get_user_from_id(idx, buffer_db=user_buffer):

        # check whether user information is in buffer to reduce amount of DB queries
        if idx in buffer_db:
            log.devinfo("Returning user '{}' from buffer.".format(idx))
            return buffer_db[idx]

        # get person information from ID service
        log.devinfo("Passing user id '{}' to ID service.".format(idx))
        person = idservice.getPersonFromDB(idx)
        user = dict()
        user["email"] = person["email1"]  # column "email1"
        user["name"] = person["name"]  # column "name"
        user["id"] = person["id"]  # column "id"

        # add user information to buffer
        buffer_db[idx] = user

        return user
Example #17
def getbody(message):
    body = None
    if message.is_multipart():
        for part in message.walk():
            if part.is_multipart():
                for subpart in part.walk():
                    if 'text/' in subpart.get_content_type():
                        body = subpart.get_payload(decode=True)
            elif 'text/' in part.get_content_type():
                body = part.get_payload(decode=True)
    elif 'text/' in message.get_content_type():
        body = message.get_payload(decode=True)

    if body is None:
        log.devinfo(message.get_content_type())
        log.devinfo(
            "An image or some other content that can not be indexed has been found. Message is given an empty body."
        )
        body = ' '
    return unicode(body, errors="replace")
Example #18
def __get_index(mbox, mbox_path, results_folder, schema, reindex):
    """Initialize the search index (and create it, if needed

    :param mbox: the mbox object to create the index for
    :param mbox_path: the path to the mbox object on disk
    :param results_folder: the folder to create the index folder in
    :param schema: the schema for the index to be created
    :param reindex: force reindexing if True
    :return: the opened index object
    """

    # create or load index:
    # 0) construct index path
    index_path = os.path.join(results_folder, "mbox-index",
                              os.path.basename(mbox_path))
    # 1) if reindexing, remove the index folder
    if os.path.exists(index_path) and reindex:
        log.devinfo("Removing index from path '{}'...".format(index_path))
        shutil.rmtree(index_path)
    # 2) Check if we need to create the index for Whoosh full-text search
    log.devinfo("Checking for index in results folder...")
    if (not os.path.exists(index_path)) or (not index.exists_in(index_path)):
        # 2.1) create index
        log.devinfo("Creating index for text search in results folder.")
        os.makedirs(index_path)  # create path
        index.create_in(index_path, schema)  # initialize as index path
        ix = index.open_dir(index_path)  # open as index path
        writer = ix.writer()
        # add all messages to index
        for message in mbox:
            writer.add_document(messageID=unicode(message['message-id']),
                                content=__mbox_getbody(message))
        writer.commit()
        log.devinfo("Index created, parsing will begin now.")
    else:
        # 2.2) load index
        log.devinfo(
            "Index has already been created, parsing will begin right away.")
        ix = index.open_dir(index_path)

    return ix
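A minimal sketch of building the Whoosh schema used in these examples and opening the index (paths are hypothetical):

import mailbox
from whoosh.fields import Schema, ID, TEXT

schema = Schema(messageID=ID(stored=True), content=TEXT)
mbox = mailbox.mbox("archives/dev-list.mbox")  # hypothetical mbox file
ix = __get_index(mbox, "archives/dev-list.mbox", "results", schema, reindex=False)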
def load(source_folder):
    """Load issues from disk.

    :param source_folder: the folder where to find 'issues.json'
    :return: the loaded issue data
    """

    srcfile = os.path.join(source_folder, "issues.json")
    log.devinfo("Loading Github issues from file '{}'...".format(srcfile))

    # check if file exists and exit early if not
    if not os.path.exists(srcfile):
        log.error(
            "Github issue file '{}' does not exist! Exiting early...".format(
                srcfile))
        sys.exit(-1)

    with open(srcfile) as issues_file:
        issue_data = json.load(issues_file)

    return issue_data