Example #1
0
def find_turns(topic, turn_section):
    """Split a talk-page section into turns and link replies to parents.

    The section is split on line breaks and blank lines are dropped.
    Because authors put line breaks inside their messages, consecutive
    lines are accumulated until a line containing a
    ``[[User:name|alias]]`` (or ``[[User talk:name|alias]]``) signature
    is seen.  That line closes the turn; the last signature found in the
    accumulated text supplies the author and, when ``str_to_datetime``
    returns a value for it, the date.  Text still pending at the end of
    the section is added as a turn built from the text alone.

    Afterwards every turn in ``topic.turns`` whose text starts with wiki
    list/indent markup is attached to a parent turn (see
    ``_link_turn_threads``).

    Args:
        topic: Object that receives turns through ``add_turn`` and
            exposes them through its ``turns`` attribute.
        turn_section: Raw wiki text of one discussion section.

    Returns:
        None.  Turns are added to ``topic``; replies are linked through
        ``add_sub`` / ``add_parent`` calls on the turn objects.
    """
    # Signature: [[User:name|alias]], optionally followed by more
    # [[...]] links (possibly parenthesised) and a free-form date string.
    user_date_reg = (
        r'\[\[User(\s*talk)*:(?P<name>.*?)'
        r'\|(?P<alias>.*?)\]\]'
        r'[\s\|]*(\(*\[\[.*\]\]\)*)*\s*'
        r'(?P<date>[a-zA-Z0-9i/,:\-\s]+)?'
    )
    user_date_reg_c = re.compile(user_date_reg)

    # Leading wiki list/indent markup (':', '*', '#', ';' or digits).
    list_reg_c = re.compile(r'[:\*#;\d]+')

    turn_text_pre = ''   # text accumulated for the turn still being built
    turn_finish = True   # False while turn_text_pre holds an open turn

    for turn_text in turn_section.split('\n'):
        turn_text = turn_text.strip()
        if not turn_text:
            continue

        user = user_date_reg_c.search(turn_text)
        if user is not None:
            # NOTE(review): list_reg_c also matches a leading ':', so the
            # second condition can never hold and this flush is currently
            # unreachable.  Kept as written; confirm the intended pattern
            # for telling thread indents apart from list items.
            if (turn_text.startswith(':')
                    and list_reg_c.match(turn_text) is None
                    and not turn_finish):
                topic.add_turn(Turn(turn_text_pre.strip()))
                turn_text_pre = ''
                turn_finish = True

            if not turn_finish:
                # The signed line completes the pending paragraphs.
                turn_text = turn_text_pre + '\n' + turn_text

            # If there are multiple signatures, take the last one.  The
            # single-line match is the fallback so the names are always
            # bound.
            signature = user
            for signature in user_date_reg_c.finditer(turn_text):
                pass
            author = signature.group('name')
            date_str = signature.group('date')

            date = None
            if date_str is not None:
                date = str_to_datetime(date_str)

            if date is not None:
                turn = Turn(turn_text.strip(), author, date)
            else:
                turn = Turn(turn_text.strip(), author)
            topic.add_turn(turn)
            turn_text_pre = ''
            turn_finish = True
        elif (not turn_text.startswith(':')
                or list_reg_c.match(turn_text) is not None):
            # Unsigned paragraph or list item: append to the pending turn.
            turn_text_pre = turn_text_pre + '\n' + turn_text
            turn_finish = False
        else:
            # NOTE(review): unreachable for the same reason as above (a
            # leading ':' always matches list_reg_c).  Intended behaviour:
            # flush the pending turn and start a new one with this line.
            if not turn_finish:
                topic.add_turn(Turn(turn_text_pre.strip()))
            turn_text_pre = turn_text
            turn_finish = False

    # Unsigned text left over at the end of the section.
    if not turn_finish and turn_text_pre:
        topic.add_turn(Turn(turn_text_pre.strip()))

    # Some users use list markup to indicate a follow-up, so the same
    # pattern is used to measure the indent level of each turn.
    _link_turn_threads(topic.turns, list_reg_c)


def _link_turn_threads(turns, indent_pattern):
    """Attach each indented turn to a parent based on its markup length.

    ``turns`` is walked in order.  A turn whose text does not start with
    markup matched by ``indent_pattern`` starts a new thread.  For other
    turns the number of leading markup characters is the intended level:
    the turn becomes a child of the turn currently at the top of the
    stack after the stack has been cut back to that level.
    """
    parent_stack = []
    for turn in turns:
        indent = indent_pattern.match(turn.text)
        if indent is None:
            # No leading markup: start of a new thread.
            del parent_stack[:]
            parent_stack.append(turn)
            continue

        if not parent_stack:
            # First turn seen is already indented; it has no parent.
            parent_stack.append(turn)
            continue

        level = len(indent.group())      # intended level
        current = len(parent_stack) - 1  # level of the latest turn
        if level <= current:
            # Reply to an earlier turn: drop everything at or below the
            # intended level so its parent is on top of the stack.
            del parent_stack[level:]

        parent = parent_stack[-1]
        parent.add_sub(turn)
        turn.add_parent(parent)
        parent_stack.append(turn)
Example #2
0
# Test the regular expression to extract user and date from
# the talk page
import re
import datetime
import smart_date

# Read the whole archive up front; the context manager closes the file
# handle once the content has been read.
with open('talk_archive') as f:
    content = f.read()

# Signature of the form [[User:name|alias]] (or "User talk"), optionally
# followed by more [[...]] links, then a free-form date string.
user_date_reg = (
    r'\[\[User(\s*talk)*:(?P<name>.*?)'
    r'\|(?P<alias>.*?)\]\]'
    r'[\s\|]*(\[\[.*\]\])*\s*'
    r'(?P<date>[a-zA-Z0-9i/,:\-\s]+)'
)

user_date_reg_c = re.compile(user_date_reg)

result = user_date_reg_c.finditer(content)

# Print every signature found, followed by the parsed date or a notice
# when smart_date cannot interpret the date text.
for match in result:
    user = match.group('name')
    date_str = match.group('date')
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('User: ' + user + '  Date: ' + date_str)
    date = smart_date.str_to_datetime(date_str)
    if date is None:
        print('Datetime is not in recognized format!')
    else:
        print(date)