예제 #1
0
def messages_to_dataframe(messages):
    """
    Build a pandas DataFrame from a list of parsed messages.

    Every message with a truthy 'Message-ID' header contributes one row,
    indexed by that Message-ID. The remaining columns are 'From',
    'Subject', 'Date', 'In-Reply-To', 'References' and 'Body'.

    """
    column_names = ['Message-ID', 'From', 'Subject', 'Date',
                    'In-Reply-To', 'References', 'Body']

    def safe_unicode(t):
        # Falsy header values (None, '') are handed back untouched;
        # anything else is decoded as UTF-8, ignoring undecodable bytes.
        if not t:
            return t
        return unicode(t, 'utf-8', 'ignore')

    def as_record(msg):
        # One flat tuple per message, ordered like column_names.
        return (msg.get('Message-ID'),
                safe_unicode(msg.get('From')),
                safe_unicode(msg.get('Subject')),
                get_date(msg),
                safe_unicode(msg.get('In-Reply-To')),
                safe_unicode(msg.get('References')),
                get_text(msg))

    # Messages lacking a Message-ID cannot be indexed, so they are skipped.
    records = [as_record(msg) for msg in messages if msg.get('Message-ID')]

    mdf = pd.DataFrame.from_records(records,
                                    index='Message-ID',
                                    columns=column_names)
    mdf.index.name = 'Message-ID'

    return mdf
예제 #2
0
def messages_to_dataframe(messages):
    """
    Convert parsed messages into a dataframe keyed by Message-ID.

    Columns are named after the headers they come from ('From',
    'Subject', 'In-Reply-To', 'References'), plus 'Date' and 'Body'.
    Messages whose 'Message-ID' header is missing or empty are left out.

    """
    def safe_unicode(t):
        # Only truthy values are decoded; None and '' pass straight through.
        if t:
            return unicode(t, 'utf-8', 'ignore')
        return t

    rows = []
    for message in messages:
        if not message.get('Message-ID'):
            continue
        # Field order must line up with the column list given below.
        rows.append((
            message.get('Message-ID'),
            safe_unicode(message.get('From')),
            safe_unicode(message.get('Subject')),
            get_date(message),
            safe_unicode(message.get('In-Reply-To')),
            safe_unicode(message.get('References')),
            get_text(message),
        ))

    header_columns = ['Message-ID', 'From', 'Subject', 'Date',
                      'In-Reply-To', 'References', 'Body']
    mdf = pd.DataFrame.from_records(rows,
                                    index='Message-ID',
                                    columns=header_columns)
    mdf.index.name = 'Message-ID'

    return mdf
예제 #3
0
def messages_to_dataframe(messages):
    """
    Build a dataframe of message data, indexed by Message-ID.

    Only messages with a truthy 'From' header are included. The 'From',
    'Subject', 'In-Reply-To' and 'References' headers are stringified
    with str(), and backslashes in 'From' are replaced by spaces.
    """
    field_names = [
        "Message-ID",
        "From",
        "Subject",
        "Date",
        "In-Reply-To",
        "References",
        "Body",
    ]

    rows = []
    for message in messages:
        if not message.get("From"):
            continue

        sender = str(message.get("From")).replace("\\", " ")
        # Each tuple follows the order of field_names.
        rows.append((
            message.get("Message-ID"),
            sender,
            str(message.get("Subject")),
            get_date(message),
            str(message.get("In-Reply-To")),
            str(message.get("References")),
            get_text(message),
        ))

    mdf = pd.DataFrame.from_records(rows, index="Message-ID", columns=field_names)
    mdf.index.name = "Message-ID"

    return mdf
예제 #4
0
파일: archive.py 프로젝트: Jack005/bigbang
    def messages_to_dataframe(self, messages):
        """
        Turn a list of parsed messages into a dataframe of message data,
        indexed by Message-ID.

        Messages without a truthy 'Message-ID' header are skipped. The
        columns are 'From', 'Subject', 'Date', 'In-Reply-To', 'References'
        and 'Body', where 'Body' holds the message's get_payload() result.

        When no message qualifies (including an empty input list), an
        empty dataframe with the same columns is returned instead of
        raising.
        """
        # Build flat records with the Message-ID as the first field and let
        # pandas pull the index out of that column. Unpacking zip(*records)
        # into ids/records would fail on an empty list, and a list passed as
        # `index` is ambiguous between field names and index labels.
        records = [(m.get('Message-ID'),
                    m.get('From'),
                    m.get('Subject'),
                    get_date(m),
                    m.get('In-Reply-To'),
                    m.get('References'),
                    m.get_payload())
                   for m in messages if m.get('Message-ID')]

        mdf = pd.DataFrame.from_records(records,
                                        index='Message-ID',
                                        columns=['Message-ID',
                                                 'From',
                                                 'Subject',
                                                 'Date',
                                                 'In-Reply-To',
                                                 'References',
                                                 'Body'])
        mdf.index.name = 'Message-ID'

        return mdf
예제 #5
0
def messages_to_dataframe(messages):
    """
    Produce a Message-ID-indexed dataframe from a list of parsed messages.

    One row is emitted per message that carries a truthy "Message-ID"
    header; the columns are "From", "Subject", "Date", "In-Reply-To",
    "References" and "Body".

    """
    columns = [
        "Message-ID",
        "From",
        "Subject",
        "Date",
        "In-Reply-To",
        "References",
        "Body",
    ]

    def safe_unicode(t):
        # Decode truthy values as UTF-8 (ignoring bad bytes); return falsy
        # values such as None or "" unchanged.
        return unicode(t, "utf-8", "ignore") if t else t

    def iter_rows():
        # Yield one tuple per indexable message, in `columns` order.
        for message in messages:
            if message.get("Message-ID"):
                yield (
                    message.get("Message-ID"),
                    safe_unicode(message.get("From")),
                    safe_unicode(message.get("Subject")),
                    get_date(message),
                    safe_unicode(message.get("In-Reply-To")),
                    safe_unicode(message.get("References")),
                    get_text(message),
                )

    mdf = pd.DataFrame.from_records(
        list(iter_rows()),
        index="Message-ID",
        columns=columns,
    )
    mdf.index.name = "Message-ID"

    return mdf
예제 #6
0
def compute_ascendancy(messages, duration=50):
    """Compute ascendancy and capacity over sliding windows of messages.

    Messages are grouped by the ordinal day of the date returned by
    ``parse.get_date``; messages with no date, or dated in the future,
    are ignored. For each day offset from the earliest day, the messages
    in the ``duration``-day window starting on that day are converted to
    an interaction graph and then a matrix, and ``ascendancy`` and
    ``capacity`` are evaluated on that matrix.

    Args:
        messages: iterable of parsed messages.
        duration: window length in days.

    Returns:
        A tuple ``(ascendancy_by_day, capacity_by_day)`` of numpy arrays
        with one slot per day from the earliest to the latest message day
        (inclusive). Both arrays are empty when no message has a usable
        date.
    """

    print("compute ascendancy")
    dated_messages = {}

    for m in messages:
        d = parse.get_date(m)

        if d is not None and d < datetime.datetime.now(pytz.utc):
            dated_messages.setdefault(d.toordinal(), []).append(m)

    # Nothing datable: min()/max() below would raise on an empty sequence.
    if not dated_messages:
        return np.zeros([0]), np.zeros([0])

    first_day = min(dated_messages)
    last_day = max(dated_messages)
    epoch = last_day - first_day

    # These result arrays must NOT be called `ascendancy` / `capacity`:
    # binding those names locally would shadow the callables of the same
    # name used in the loop below and turn the calls into
    # "'numpy.ndarray' object is not callable" errors.
    ascendancy_by_day = np.zeros([epoch + 1])
    capacity_by_day = np.zeros([epoch + 1])

    # NOTE(review): range(epoch) never fills the final slot (index
    # `epoch`), which therefore stays 0 -- kept as in the original;
    # confirm whether that is intended.
    for i in range(epoch):
        window_start = first_day + i

        block_messages = []
        for day in range(window_start, window_start + duration):
            block_messages.extend(dated_messages.get(day, []))

        b_IG = messages_to_interaction_graph(block_messages)
        b_matrix = interaction_graph_to_matrix(b_IG)

        ascendancy_by_day[i] = ascendancy(b_matrix)
        capacity_by_day[i] = capacity(b_matrix)

    return ascendancy_by_day, capacity_by_day
예제 #7
0
def messages_to_dataframe(messages):
    """
    Assemble a dataframe of message data from parsed messages, using the
    Message-ID as index and header names as column names.

    Messages without a truthy 'From' header are dropped. Header values
    are converted with str(); backslashes in 'From' become spaces.
    """
    columns = ['Message-ID', 'From', 'Subject', 'Date', 'In-Reply-To',
               'References', 'Body']

    def to_row(msg):
        # Tuple layout mirrors `columns`.
        return (
            msg.get('Message-ID'),
            str(msg.get('From')).replace('\\', ' '),
            str(msg.get('Subject')),
            get_date(msg),
            str(msg.get('In-Reply-To')),
            str(msg.get('References')),
            get_text(msg),
        )

    rows = [to_row(msg) for msg in messages if msg.get('From')]

    mdf = pd.DataFrame.from_records(rows, index='Message-ID', columns=columns)
    mdf.index.name = 'Message-ID'

    return mdf
예제 #8
0
import pytz

# Archive location handed to mailman.open_list_archives below.
url = "http://mail.scipy.org/pipermail/scipy-dev/"

messages = mailman.open_list_archives(url)

dates = []
froms = []
broke = []  # messages for which get_date raised

for m in messages:
    m_from = m.get('From')
    froms.append(m_from)

    try:
        date = get_date(m)

        dates.append(date)

    except Exception as e:
        # Best-effort: report the failure, keep `dates` aligned with `froms`
        # by recording a missing value, and remember the offending message.
        # print(...) with a single argument behaves the same on Python 2
        # and 3, unlike the `print e` statement form.
        print(e)
        dates.append(pd.NaT)
        broke.append(m)

# just drop the missing values for now
data = pd.DataFrame({'Date':dates,'From':froms}).dropna()

# because sometimes somebody sends a message from the future
data = data[data['Date'] < datetime.datetime.now(pytz.utc)]

### I've been having trouble getting traction with pandas