def messages_to_dataframe(messages):
    """
    Turn a list of parsed messages into a dataframe of message data,
    indexed by message-id, with column-names from headers.

    Messages for which ``m.get('Message-ID')`` is falsy are skipped.
    Each remaining message contributes one record holding its
    'Message-ID', 'From', 'Subject', 'In-Reply-To' and 'References'
    headers, a 'Date' taken from ``get_date(m)`` and a 'Body' taken
    from ``get_text(m)``.  The 'Message-ID' field is used as the index.
    """

    def safe_unicode(t):
        # Only non-empty byte strings need decoding.  None, empty
        # values and text that is already decoded pass through
        # untouched.  The former ``unicode(t, 'utf-8', 'ignore')`` call
        # raised NameError on Python 3 and TypeError on Python 2 when
        # handed text that was already unicode.
        if not t or not isinstance(t, bytes):
            return t
        return t.decode('utf-8', 'ignore')

    columns = ['Message-ID', 'From', 'Subject', 'Date',
               'In-Reply-To', 'References', 'Body']

    # extract data into a list of tuples -- records -- with
    # the Message-ID separated out as an index
    pm = [(m.get('Message-ID'),
           safe_unicode(m.get('From')),
           safe_unicode(m.get('Subject')),
           get_date(m),
           safe_unicode(m.get('In-Reply-To')),
           safe_unicode(m.get('References')),
           get_text(m))
          for m in messages if m.get('Message-ID')]

    mdf = pd.DataFrame.from_records(pm,
                                    index='Message-ID',
                                    columns=columns)
    mdf.index.name = 'Message-ID'

    return mdf
def messages_to_dataframe(messages):
    """
    Build a dataframe of message data from a list of parsed messages.

    The frame is indexed by message-id and its column names come from
    the message headers.  Only messages with a truthy "From" header are
    included.  Header values are passed through ``str()``, so a header
    for which ``m.get(...)`` returns None ends up as the string "None";
    backslashes in "From" are replaced by spaces.  "Date" comes from
    ``get_date(m)`` and "Body" from ``get_text(m)``.
    """
    field_names = [
        "Message-ID",
        "From",
        "Subject",
        "Date",
        "In-Reply-To",
        "References",
        "Body",
    ]

    def as_record(msg):
        # One tuple per message, fields in the same order as field_names.
        return (
            msg.get("Message-ID"),
            str(msg.get("From")).replace("\\", " "),
            str(msg.get("Subject")),
            get_date(msg),
            str(msg.get("In-Reply-To")),
            str(msg.get("References")),
            get_text(msg),
        )

    rows = [as_record(msg) for msg in messages if msg.get("From")]

    # The Message-ID field of each record becomes the index.
    frame = pd.DataFrame.from_records(
        rows, index="Message-ID", columns=field_names
    )
    frame.index.name = "Message-ID"
    return frame
def messages_to_dataframe(self, messages):
    """
    Turn parsed messages into a dataframe indexed by Message-ID.

    Messages for which ``m.get('Message-ID')`` is falsy are skipped.
    The columns are 'From', 'Subject', 'Date', 'In-Reply-To',
    'References' and 'Body'; 'Date' comes from ``get_date(m)`` and
    'Body' from ``m.get_payload()``.

    When no message carries a Message-ID, an empty dataframe with the
    same columns and index name is returned.
    """
    columns = ['From', 'Subject', 'Date',
               'In-Reply-To', 'References', 'Body']

    # extract data into a list of tuples -- records -- with
    # the Message-ID separated out as an index
    ids = []
    records = []
    for m in messages:
        message_id = m.get('Message-ID')
        if not message_id:
            continue
        ids.append(message_id)
        records.append((m.get('From'),
                        m.get('Subject'),
                        get_date(m),
                        m.get('In-Reply-To'),
                        m.get('References'),
                        m.get_payload()))

    if records:
        mdf = pd.DataFrame.from_records(records,
                                        index=ids,
                                        columns=columns)
    else:
        # The former ``ids, records = zip(*pm)`` raised ValueError when
        # there was nothing to unpack; return an empty frame instead.
        mdf = pd.DataFrame(columns=columns)

    mdf.index.name = 'Message-ID'

    return mdf
def messages_to_dataframe(messages):
    """
    Turn a list of parsed messages into a dataframe of message data,
    indexed by message-id, with column-names from headers.

    Messages for which ``m.get("Message-ID")`` is falsy are skipped.
    Each remaining message contributes one record holding its
    "Message-ID", "From", "Subject", "In-Reply-To" and "References"
    headers, a "Date" taken from ``get_date(m)`` and a "Body" taken
    from ``get_text(m)``.  The "Message-ID" field is used as the index.
    """

    def safe_unicode(t):
        # Decode non-empty byte strings only; None, empty values and
        # already-decoded text are returned unchanged.  The previous
        # ``unicode(t, "utf-8", "ignore")`` call does not exist on
        # Python 3 (NameError) and rejected already-decoded text on
        # Python 2 (TypeError).
        if not t or not isinstance(t, bytes):
            return t
        return t.decode("utf-8", "ignore")

    # extract data into a list of tuples -- records -- with
    # the Message-ID separated out as an index
    pm = [
        (
            m.get("Message-ID"),
            safe_unicode(m.get("From")),
            safe_unicode(m.get("Subject")),
            get_date(m),
            safe_unicode(m.get("In-Reply-To")),
            safe_unicode(m.get("References")),
            get_text(m),
        )
        for m in messages
        if m.get("Message-ID")
    ]

    mdf = pd.DataFrame.from_records(
        pm,
        index="Message-ID",
        columns=[
            "Message-ID",
            "From",
            "Subject",
            "Date",
            "In-Reply-To",
            "References",
            "Body",
        ],
    )
    mdf.index.name = "Message-ID"

    return mdf
def compute_ascendancy(messages, duration=50):
    """Compute ascendancy given messages.

    Messages are grouped by the ordinal day of ``parse.get_date(m)``;
    messages with no date, or with a date that is not earlier than the
    current UTC time, are ignored.  For each start day a window of
    ``duration`` days of messages is turned into an interaction graph
    and matrix, and ``ascendancy`` / ``capacity`` of that matrix are
    stored at the start day's offset from the first day.

    Returns a pair ``(ascendancy_values, capacity_values)`` of numpy
    arrays with one slot per day between the first and last dated
    message, inclusive.  Both arrays are empty when no message has a
    usable date.
    """
    print("compute ascendancy")

    # Bucket the messages by ordinal day.
    dated_messages = {}
    for m in messages:
        d = parse.get_date(m)
        if d is not None and d < datetime.datetime.now(pytz.utc):
            dated_messages.setdefault(d.toordinal(), []).append(m)

    # min()/max() below would raise ValueError on an empty mapping.
    if not dated_messages:
        return np.zeros(0), np.zeros(0)

    first_day = min(dated_messages)
    last_day = max(dated_messages)
    epoch = last_day - first_day

    # These arrays must not be called ``ascendancy`` / ``capacity``:
    # doing so shadowed the functions of the same name used in the loop
    # and made every call fail with "'numpy.ndarray' object is not
    # callable".
    ascendancy_values = np.zeros(epoch + 1)
    capacity_values = np.zeros(epoch + 1)

    # NOTE(review): range(epoch) never visits the last day, so the final
    # slot of both arrays stays 0 -- confirm this is intended.
    for i in range(epoch):
        min_d = first_day + i
        max_d = min_d + duration

        block_messages = []
        for d in range(min_d, max_d):
            block_messages.extend(dated_messages.get(d, []))

        b_IG = messages_to_interaction_graph(block_messages)
        b_matrix = interaction_graph_to_matrix(b_IG)

        ascendancy_values[i] = ascendancy(b_matrix)
        capacity_values[i] = capacity(b_matrix)

    return ascendancy_values, capacity_values
def messages_to_dataframe(messages):
    """
    Convert a list of parsed messages into a dataframe of message data.

    The result is indexed by message-id and its column names are taken
    from the message headers.  A message is included only when its
    'From' header is truthy.  Header values go through ``str()``, so a
    header for which ``m.get(...)`` returns None is stored as the
    string 'None'; backslashes in 'From' are replaced by spaces.
    'Date' comes from ``get_date(m)`` and 'Body' from ``get_text(m)``.
    """
    field_names = ['Message-ID', 'From', 'Subject', 'Date',
                   'In-Reply-To', 'References', 'Body']

    # Collect one tuple per message, fields ordered as in field_names.
    rows = []
    for msg in messages:
        if not msg.get('From'):
            continue
        rows.append((
            msg.get('Message-ID'),
            str(msg.get('From')).replace('\\', ' '),
            str(msg.get('Subject')),
            get_date(msg),
            str(msg.get('In-Reply-To')),
            str(msg.get('References')),
            get_text(msg),
        ))

    # The Message-ID field of each record becomes the index.
    frame = pd.DataFrame.from_records(rows,
                                      index='Message-ID',
                                      columns=field_names)
    frame.index.name = 'Message-ID'
    return frame
import pytz

# Fetch the mailing-list archive and collect each message's sender and date.
url = "http://mail.scipy.org/pipermail/scipy-dev/"
messages = mailman.open_list_archives(url)

dates = []
froms = []
broke = []  # messages whose date could not be extracted

for m in messages:
    m_from = m.get('From')
    froms.append(m_from)
    try:
        date = get_date(m)
        dates.append(date)
    except Exception as e:
        # Best effort: report the problem, record a missing date so that
        # dates and froms stay aligned, and remember the message.
        # print(e) works on both Python 2 and 3, unlike the former
        # ``print e`` statement.
        print(e)
        dates.append(pd.NaT)
        broke.append(m)

# just drop the missing values for now
data = pd.DataFrame({'Date': dates, 'From': froms}).dropna()

# because sometimes somebody sends a message from the future
data = data[data['Date'] < datetime.datetime.now(pytz.utc)]

### I've been having trouble getting traction with pandas