Example #1
0
def remove_duplicate_headers(to_remove=duplicate_uid,
                             json_header_filename='headers.json'):
    """
    Rewrite the JSON header file so that only the first entry seen for each Message-ID is kept.

    The whole file is de-duplicated: ``to_remove`` is not consulted entry by entry, it only decides
    whether the rewrite happens at all (nothing is done when it is empty).

    :param to_remove: Collection of duplicate UIDs. Defaults to the module-level ``duplicate_uid``.
    :param json_header_filename: The header file that is read and then overwritten in place.
    """
    if len(to_remove) == 0:
        return

    print("Removing duplicate headers...")

    # Message-IDs already encountered; a second occurrence of an id is a duplicate and is skipped.
    seen_ids = set()
    # Headers that survive de-duplication, in their original file order.
    unique_headers = []

    with open(json_header_filename, 'r') as src:
        for chunk in lines_per_n(src, 9):
            header = json.loads(chunk)
            uid = header['Message-ID']
            if uid in seen_ids:
                continue
            seen_ids.add(uid)
            unique_headers.append(header)

    # The file is only reopened for writing once every object has been parsed successfully.
    with open(json_header_filename, 'w') as dst:
        for header in unique_headers:
            json.dump(header, dst, indent=1)
            dst.write("\n")
Example #2
0
def replace_invalid_headers(to_replace=invalid_uid,
                            json_header_filename="headers.json"):
    """
    Drop the headers whose Message-ID is listed in ``to_replace`` and request them again.

    The JSON file is rewritten without the listed entries, after which ``add_missing_headers(to_replace)``
    is called to fetch those headers afresh. Nothing happens when ``to_replace`` is empty.

    :param to_replace: Collection of UIDs that need to be replaced. Default value is the module-level
        collection of invalid mails' UIDs.
    :param json_header_filename: The json file containing the headers; it is overwritten in place.
    """
    if len(to_replace) > 0:
        print("Replacing invalid headers...")

        # Filter on the argument itself (not on the global default) so that a caller-supplied
        # collection is honoured. A set keeps each membership test O(1).
        uids_to_drop = set(to_replace)

        # JSON objects that are kept and written back to the file.
        write_to_file = []
        with open(json_header_filename, 'r') as json_file:
            for chunk in lines_per_n(json_file, 9):
                json_obj = json.loads(chunk)
                if json_obj['Message-ID'] not in uids_to_drop:
                    write_to_file.append(json_obj)

        with open(json_header_filename, 'w') as json_file:
            for json_obj in write_to_file:
                json.dump(json_obj, json_file, indent=1)
                json_file.write("\n")

        add_missing_headers(to_replace)
Example #3
0
def remove_unwanted_headers(to_remove=unwanted_uid,
                            json_header_filename='headers.json'):
    """
    Remove every header whose Message-ID is listed in ``to_remove`` from the JSON file.

    By default this removes the unwanted entries, i.e. the UIDs of mails that are not forwarded from the
    LKML subscription. Nothing happens when ``to_remove`` is empty.

    :param to_remove: Collection of UIDs that need to be removed. Default value is the module-level
        collection of unwanted mails' UIDs.
    :param json_header_filename: The header file from which the entries are removed; overwritten in place.
    """
    if len(to_remove) > 0:
        print("Removing unwanted headers...")

        # Filter on the argument itself (not on the global default) so that a caller-supplied
        # collection is honoured. A set keeps each membership test O(1).
        uids_to_drop = set(to_remove)

        # JSON objects that are kept and written back to the file.
        write_to_file = []

        with open(json_header_filename, 'r') as json_file:
            for chunk in lines_per_n(json_file, 9):
                json_obj = json.loads(chunk)
                if json_obj['Message-ID'] not in uids_to_drop:
                    write_to_file.append(json_obj)

        with open(json_header_filename, 'w') as json_file:
            for json_obj in write_to_file:
                json.dump(json_obj, json_file, indent=1)
                json_file.write("\n")
Example #4
0
def get_uid_map(write_to_file=True):
    """
    Build the mapping of authors to a unique integer identifier from clean_data.json.

    Authors are identified through a regular expression search for e-mail addresses in the From, To and Cc
    headers. If no address can be extracted from From, the raw From value is used as the author key.
    Identifiers are assigned in sorted order of the author keys so that the mapping is reproducible
    between runs.

    :param write_to_file: If true, the mapping is written to author_uid_map.json (default=True)
    :return: A dict mapping each author key (e-mail address) to its integer identifier
    """
    email_re = re.compile(r'[\w\.-]+@[\w\.-]+')
    author_set = set()

    with open('clean_data.json', 'r') as json_file:
        for chunk in lines_per_n(json_file, 9):
            json_obj = json.loads(chunk)

            sender = json_obj['From']
            from_addr = email_re.search(sender)
            author_set.add(from_addr.group(0) if from_addr is not None else sender)

            # Recipient headers may be null in the JSON; skip those instead of failing in findall().
            for field in ('To', 'Cc'):
                recipients = json_obj[field]
                if recipients is not None:
                    author_set.update(email_re.findall(recipients))
    print("JSON data loaded.")

    # Sorting first makes the id assignment independent of set iteration order.
    author_uid_map = {address: index for index, address in enumerate(sorted(author_set))}

    if write_to_file:
        with open("author_uid_map.json", 'w') as map_file:
            json.dump(author_uid_map, map_file, indent=1)
        print("UID map written to author_uid_map.json.")
    return author_uid_map
Example #5
0
def get_leaf_nodes(write_to_file=True):
    """
    Compute the message-ids of leaf nodes in the thread graph described by clean_data.json.

    Every message starts out as a leaf candidate; for each message that has references, the candidate
    list is replaced by the result of ``get_current_leaf_nodes(candidates, references)``.
    NOTE(review): that helper is defined elsewhere and presumably drops the referenced ids - confirm.

    :param write_to_file: If true, writes the leaf nodes to graph_leaf_nodes.csv as
        ``message-id;references`` lines (default = True)
    :return: List of message-ids of leaf nodes
    """
    leaf_msgs = []  # Keeps track of all those message ids that are leaf nodes
    msg_ref_map = {}  # Map between message id of each mail to its references (stringified)

    with open('clean_data.json', 'r') as fil:
        for chunk in lines_per_n(fil, 9):
            jfile = json.loads(chunk)
            msg_id = jfile['Message-ID']
            references = jfile['References']

            leaf_msgs.append(msg_id)
            msg_ref_map[msg_id] = str(references)

            if references is not None:
                # References are stored as a comma separated string.
                leaf_msgs = get_current_leaf_nodes(leaf_msgs, references.split(','))

    if write_to_file:
        with open('graph_leaf_nodes.csv', 'w') as csv_file:
            for msg_id in leaf_msgs:
                csv_file.write("{0};{1}\n".format(msg_id, msg_ref_map[msg_id]))
    return leaf_msgs
def generate_edge_list(nodelist_filename='graph_nodes.csv',
                       edgelist_filename='graph_edges.csv',
                       json_filename='clean_data.json'):
    """
    Derive the graph's nodes and edges from the JSON headers and save them as semicolon separated files.

    Each node line is ``message-id;sender;time`` (the sender with all whitespace removed). An edge
    ``parent;child`` is recorded for the last entry of References and for In-Reply-To, but only when
    the parent id is smaller than the message's own id.

    :param nodelist_filename: File that receives one line per node.
    :param edgelist_filename: File that receives one line per edge.
    :param json_filename: The JSON file containing the headers.
    """
    # Sets, so that identical node lines / edges are stored only once.
    node_rows = set()
    edge_pairs = set()

    with open(json_filename, 'r') as src:
        for chunk in lines_per_n(src, 9):
            header = json.loads(chunk)
            uid = header['Message-ID']
            sent_at = header['Time']
            sender = "".join(header['From'].split())
            node_rows.add(";".join((str(uid), sender, sent_at)))

            references = header['References']
            if references:
                # Message Id of the parent mail is the last element of the comma separated references.
                parent = int(str(references).split(',')[-1])
                if parent and parent < uid:
                    edge_pairs.add((parent, uid))

            reply_to = header['In-Reply-To']
            if reply_to and reply_to < uid:
                edge_pairs.add((reply_to, uid))

    with open(nodelist_filename, 'w') as node_file:
        node_file.writelines(row + "\n" for row in node_rows)

    with open(edgelist_filename, 'w') as edge_file:
        edge_file.writelines(str(parent) + ';' + str(child) + "\n" for parent, child in edge_pairs)
Example #7
0
def generate_node_labels(nodelist_filename='graph_nodes.txt',
                         edgelist_filename='graph_edges.txt',
                         json_filename='clean_data.json'):
    """
    Generate the lists of nodes and edges in the graph from the JSON file and save them as TXT files.

    Each node line is the message id followed by a comma. Each edge line is ``parent<TAB>child``; an edge
    is recorded for the last entry of References and for In-Reply-To, but only when the parent id is
    smaller than the message's own id.

    :param nodelist_filename: txt file to store the graph nodes.
    :param edgelist_filename: txt file to store the graph edges.
    :param json_filename: The JSON file containing the cleaned headers.
    """
    # Sets, so that identical node lines / edges are stored only once.
    nodes = set()
    edges = set()
    with open(json_filename, 'r') as fil:
        for chunk in lines_per_n(fil, 9):
            jfile = json.loads(chunk)
            # Only the ids are needed here; the Time and From headers are deliberately not read.
            msg_id = jfile['Message-ID']
            nodes.add(str(msg_id) + ",")
            if jfile['References']:
                ref_list = str(jfile['References']).split(',')
                # Message Id of the parent mail is appended to the end of the list of references.
                parent_id = int(ref_list[-1])
                if parent_id and parent_id < msg_id:
                    edges.add((parent_id, msg_id))
            if jfile['In-Reply-To']:
                parent_id = jfile['In-Reply-To']
                if parent_id and parent_id < msg_id:
                    edges.add((parent_id, msg_id))
    with open(nodelist_filename, 'w') as node_file:
        for node_str in nodes:
            node_file.write(node_str + "\n")
    with open(edgelist_filename, 'w') as edge_file:
        for parent_id, msg_id in edges:
            edge_file.write(str(parent_id) + '\t' + str(msg_id) + "\n")
Example #8
0
def check_validity(check_unavailable_uid='False',
                   json_header_filename='headers.json'):
    """
    Check for and print duplicate, missing, and invalid objects in the JSON header file.

    This function can be run first to populate the module-level collections ``duplicate_uid``,
    ``invalid_uid``, ``unwanted_uid`` and ``missing_uid``, which can then be used to add or remove the
    corresponding entries from the JSON file. Parsing stops at the first unreadable JSON object.

    :param check_unavailable_uid: If true, also fetches and prints the unavailable uids, prints the unwanted
        uids and appends the latter to unwanted_uid.txt. The default is the string 'False'; false-like
        strings ('', 'false', '0', 'no', case-insensitive) are treated as False.
    :param json_header_filename: The header file to be parsed
    :return: The module-level ``last_uid_read``, i.e. the highest UID read. It is left at its previous
        value when no object could be read.
    """
    global last_uid_read, missing_uid, unavailable_uid

    # A non-empty string is always truthy, so the default 'False' would otherwise enable the check.
    if isinstance(check_unavailable_uid, str):
        check_unavailable_uid = check_unavailable_uid.strip().lower() not in ('', 'false', '0', 'no')

    previous_uid = 0

    # The "read_uid" set is used to keep track of all the UIDs that have been read from the JSON file.
    # In case a duplicate exists, it would be read twice and hence would fail the set membership test.
    read_uid = set()

    # Attributes every header object is required to carry.
    header_attrib = {'Message-ID', 'From', 'To', 'Cc', 'In-Reply-To', 'Time'}

    with open(json_header_filename, 'r') as json_file:

        for chunk in lines_per_n(json_file, 9):
            try:
                json_obj = json.loads(chunk)
            except ValueError:
                # json.JSONDecodeError is a subclass of ValueError.
                print("Unreadable JSON object after UID: " + str(previous_uid))
                break

            uid = json_obj['Message-ID']

            # Checking for duplicate objects
            if uid in read_uid:
                duplicate_uid.add(uid)
            else:
                read_uid.add(uid)

            # Check if the JSON object has sufficient attributes by checking if "header_attrib" is a subset of its keys
            if not header_attrib <= json_obj.keys() or json_obj['Time'] is None:
                invalid_uid.add(uid)

            # Check if it is a mail that is sent directly to "*****@*****.**", in which case it has not been
            # forwarded from the LKML subscription. .get() is used because an object that was just flagged
            # as invalid may lack the 'To' attribute altogether.
            if json_obj.get('To') == "*****@*****.**":
                unwanted_uid.add(uid)

            previous_uid = uid

    # Calculate the missing UIDs by performing a set difference on all the UIDs possible till the highest UID read
    # from the actual UIDs that have been read.
    if read_uid:
        last_uid_read = max(read_uid)
        missing_uid = set(range(min(read_uid), last_uid_read + 1)) - read_uid

    if check_unavailable_uid:
        unavailable_uid = get_unavailable_uid()
        print("Unavailable UIDs: ",
              unavailable_uid if len(unavailable_uid) > 0 else "None")
        with open("unwanted_uid.txt", 'a') as unw_file:
            for uid in unwanted_uid:
                unw_file.write(str(uid) + '\n')
        print("Unwanted UIDs: ",
              unwanted_uid if len(unwanted_uid) > 0 else "None")

    print("Duplicate UIDs: ",
          duplicate_uid if len(duplicate_uid) > 0 else "None")
    print("Missing UIDs: ", missing_uid if len(missing_uid) > 0 else "None")
    print("Invalid UIDs: ", invalid_uid if len(invalid_uid) > 0 else "None")
    return last_uid_read
def _dump_reference_header(jfile, out_file, stringify_time=False):
    """Write the seven retained header fields of ``jfile`` to ``out_file`` as one indented JSON object."""
    data = {key: jfile[key]
            for key in ('Message-ID', 'From', 'To', 'Cc', 'In-Reply-To', 'References', 'Time')}
    if stringify_time:
        data['Time'] = str(data['Time'])
    json.dump(data, out_file, indent=1)
    out_file.write('\n')


def remove_invalid_references(input_json_filename,
                              output_json_filename,
                              ref_toggle=False):
    """
    Copy the headers from the input JSON file to the output file, leaving out mails with invalid references.

    A mail is handled as follows:

    * References is None (possibly the start of a thread): always written, with Time converted to a string.
    * References is an empty string: dropped.
    * Its reference list contains '0' (a reference to a mail that is not available to us): dropped, and
      its own Message-ID is remembered as an unspecified reference.
    * Its reference list contains a remembered unspecified reference: dropped.
    * Otherwise: written.

    :param input_json_filename: The JSON file containing the headers to be filtered.
    :param output_json_filename: The JSON file the retained headers are written to (UTF-8).
    :param ref_toggle: If True, the comma separated References attribute forms the reference list; if
        False, the list consists of In-Reply-To only (and is empty when In-Reply-To is None).
    """
    # The "unspecified_ref" set keeps track of '0' and of all those mails that have '0' in their reference
    # list. If any mail has any of these elements in its list of references, we can eliminate it as well.
    unspecified_ref = {'0'}

    print("Removing headers associated with invalid references...")

    with open(input_json_filename, 'r') as fil, \
            open(output_json_filename, mode='w', encoding='utf-8') as fin_file:

        for chunk in lines_per_n(fil, 9):
            # The "jfile" is used to store the json object read from the file.
            jfile = json.loads(chunk)

            # Mails that have references of type None may be the start of threads: keep them all.
            if jfile['References'] is None:
                _dump_reference_header(jfile, fin_file, stringify_time=True)
                continue

            # Mails whose references are an empty string are left out.
            if jfile['References'] == "":
                continue

            if ref_toggle:
                # The references are stored as a comma separated string.
                ref_list = jfile['References'].split(',')
            elif jfile['In-Reply-To'] is not None:
                ref_list = [str(jfile['In-Reply-To'])]
            else:
                # No parent to inspect: an empty list lets the mail pass both checks below instead of
                # raising TypeError on a None reference list.
                ref_list = []

            if '0' in ref_list:
                # A '0' indicates a reference to some other mail which is not available to us.
                unspecified_ref.add(str(jfile['Message-ID']))
            elif not any(ref in unspecified_ref for ref in ref_list):
                _dump_reference_header(jfile, fin_file)