def extract_pieces(msg_lines, msg_offset, mbx, inner_mesg=False): """Takes four parameters. The first is a list of line strings containing the headers and body of a message from a Eudora MBX file. The second is the offset of the first character in the first line in the msg_lines list within the MBX file we're processing. The third is the name of the MBX file we're reading. The fourth, inner_mesg, is a boolean controlling whether or not the pieces were are extracting are a top-level message in the MBX file. If inner_mesg is true, we will carry out our processing under the assumption that we are handling an attached message carried in an message/rfc822 segment. Returns a tuple (header, body, attachments, embeddeds, mbx) containing a Header object, a body String containing the body of the message, a list of attachment definition tuples, a list of embedded definition tuples, and the name of the MBX file we're processing.""" global toc_info, replies global target headers = Header() body = [] attachments = [] embeddeds = [] in_headers = True found_rfc822_inner_mesg = False is_html = False if not inner_mesg: headers.add('From ', msg_lines[0][5:].strip()) for line in msg_lines: if in_headers: if re_initial_whitespace.match(line): # Header "folding" (RFC 2822 3.2.3) headers.appendToLast(line) elif len(line.strip()) != 0: # Message header headers.add_line(line) attachment_matcher = re_x_attachment.match(line) if attachment_matcher: files = attachment_matcher.group(1) attach_list = re.split(';\s*', files) for attachment in attach_list: attachments.append((attachment, target)) else: # End of message headers. # scrub the header lines we've scanned if not inner_mesg: headers.clean(toc_info, msg_offset, replies) in_headers = False content_type = headers.getValue('Content-Type:') if content_type and content_type.lower() == 'message/rfc822': found_rfc822_inner_mesg = True print "+", elif found_rfc822_inner_mesg: # We're processing a message/rfc822 message, # and so we don't want to process attachments # at this level. Instead, we want to properly # extract all body lines for later processing body.append(strip_linesep(line) + "\n") else: # We're in the body of the text and we need to # handle attachments if not is_html and re_xhtml.search(line) or re_normal_html.search( line): is_html = True if attachments_dirs and re_attachment.search(line): # remove the newline that # Eudora inserts before the # 'Attachment Converted' line. if len(body) > 0 and (body[-1] == '\n' or body[-1] == '\r\n'): body.pop() #EudoraLog.log.warn("Adding attachment with contenttype = " + contenttype) attachments.append((line, target)) else: embedded_matcher = re_embedded.match(line) if embedded_matcher: filename = embedded_matcher.group(1) embeddeds.append(filename) else: orig_line = line if scrub_xflowed: line = re.sub(re_xflowed, '', line) line = re.sub(re_xhtml, '', line) line = re.sub(re_pete_stuff, '', line) if orig_line == line or line != '': body.append(strip_linesep(line) + "\n") return (headers, body, attachments, embeddeds, mbx, is_html)
def extract_pieces( msg_lines, msg_offset, mbx, inner_mesg=False ): """Takes four parameters. The first is a list of line strings containing the headers and body of a message from a Eudora MBX file. The second is the offset of the first character in the first line in the msg_lines list within the MBX file we're processing. The third is the name of the MBX file we're reading. The fourth, inner_mesg, is a boolean controlling whether or not the pieces were are extracting are a top-level message in the MBX file. If inner_mesg is true, we will carry out our processing under the assumption that we are handling an attached message carried in an message/rfc822 segment. Returns a tuple (header, body, attachments, embeddeds, mbx) containing a Header object, a body String containing the body of the message, a list of attachment definition tuples, a list of embedded definition tuples, and the name of the MBX file we're processing.""" global toc_info, replies global target headers = Header() body = [] attachments = [] embeddeds = [] in_headers = True found_rfc822_inner_mesg = False is_html = False if not inner_mesg: headers.add( 'From ', msg_lines[0][5:].strip() ) for line in msg_lines: if in_headers: if re_initial_whitespace.match( line ): # Header "folding" (RFC 2822 3.2.3) headers.appendToLast( line ) elif len( line.strip() ) != 0: # Message header headers.add_line(line) attachment_matcher = re_x_attachment.match( line ) if attachment_matcher: files = attachment_matcher.group(1) attach_list = re.split(';\s*', files) for attachment in attach_list: attachments.append( (attachment, target) ) else: # End of message headers. # scrub the header lines we've scanned if not inner_mesg: headers.clean(toc_info, msg_offset, replies) in_headers = False content_type = headers.getValue('Content-Type:') if content_type and content_type.lower() == 'message/rfc822': found_rfc822_inner_mesg = True print "+", elif found_rfc822_inner_mesg: # We're processing a message/rfc822 message, # and so we don't want to process attachments # at this level. Instead, we want to properly # extract all body lines for later processing body.append(strip_linesep(line) + "\n") else: # We're in the body of the text and we need to # handle attachments if not is_html and re_xhtml.search( line ) or re_normal_html.search( line ): is_html = True if attachments_dirs and re_attachment.search( line ): # remove the newline that # Eudora inserts before the # 'Attachment Converted' line. if len(body) > 0 and (body[-1] == '\n' or body[-1] == '\r\n'): body.pop() #EudoraLog.log.warn("Adding attachment with contenttype = " + contenttype) attachments.append( (line, target) ) else: embedded_matcher = re_embedded.match ( line ) if embedded_matcher: filename = embedded_matcher.group(1) embeddeds.append( filename ) else: orig_line = line if scrub_xflowed: line = re.sub(re_xflowed, '', line) line = re.sub(re_xhtml, '', line) line = re.sub(re_pete_stuff, '', line) if orig_line == line or line != '': body.append(strip_linesep(line) + "\n") return ( headers, body, attachments, embeddeds, mbx, is_html )
if sentence[i + 1]['POS'].startswith("T"): if sentence[i + 2]['POS'].startswith('N'): if sentence[i+2]['token'].lower() != \ sentence[i+2]['lemma'].split('-')[0].lower(): # set_trace() if sentence[i + 2]['token'].lower()[0] != sentence[ i + 2]['lemma'].split('-')[0].lower()[0]: eclip += 1 elif sentence[i + 2]['token'].lower()[1] == 'h': lenited += 1 return (eclip, lenited) featureFile = [] print "Creating Feature File" header.add("EMPTY") header.add("ECLIPSIS") header.add("LENITION") header.add("SUFFIX_COUNT") with open('featureFile.dat', 'w') as fo: i = 0 for dialect in l: for book in dialect: for sentence in book['sentences']: cnt = Counter() suff = checkSuff(sentence) cnt[header["SUFFIX_COUNT"]] = suff e_l = checkLenitedN(sentence) cnt[header["ECLIPSIS"]] = e_l[0] cnt[header["LENITION"]] = e_l[1] for word in sentence:
for i,word in enumerate(sentence): if word["POS"].startswith('S') and i+2 <leng: if sentence[i+1]['POS'].startswith("T"): if sentence[i+2]['POS'].startswith('N'): if sentence[i+2]['token'].lower() != \ sentence[i+2]['lemma'].split('-')[0].lower(): # set_trace() if sentence[i+2]['token'].lower()[0] != sentence[i+2]['lemma'].split('-')[0].lower()[0]: eclip+=1 elif sentence[i+2]['token'].lower()[1] == 'h': lenited +=1 return (eclip,lenited) featureFile = [] print "Creating Feature File" header.add("EMPTY") header.add("ECLIPSIS") header.add("LENITION") header.add("SUFFIX_COUNT") with open('featureFile.dat', 'w') as fo: i = 0 for dialect in l: for book in dialect: for sentence in book['sentences']: cnt = Counter() suff = checkSuff(sentence) cnt[header["SUFFIX_COUNT"]] = suff e_l = checkLenitedN(sentence) cnt[header["ECLIPSIS"]] = e_l[0] cnt[header["LENITION"]] = e_l[1] for word in sentence: