Exemple #1
0
def gui_parse():
    """Parse a .txt or .docx question file and open the result in an editor.

    Command-line arguments: an optional ``filename`` and a ``--debug``/``-d``
    flag (which also sets the module-level ``debug``). When no filename is
    given, a Tk file dialog asks for one.

    A ``.txt`` file is read as UTF-8 and passed to ``chgk_parse``. A ``.docx``
    file is converted to HTML with PyDocX, simplified with BeautifulSoup
    (embedded images are saved next to the input file unless ``--debug`` is
    set), turned into text with html2text and then passed to ``chgk_parse``.

    The parsed structure is rendered with ``compose_4s`` and written, from the
    script's own directory, to ``make_filename(args.filename)``; that file is
    then opened with ``TEXTEDITOR``.

    Exits with status 1 when no file is selected or the extension is not
    supported.
    """
    global __file__                         # to fix stupid __file__
    global debug
    __file__ = os.path.abspath(__file__)    # handling in python 2

    # A hidden root window is required before a Tk dialog can be shown.
    root = Tk()
    root.withdraw()

    sys.stderr = codecs.getwriter('utf8')(sys.stderr)

    parser = argparse.ArgumentParser()
    parser.add_argument('filename', nargs='?')
    parser.add_argument('--debug', '-d', action='store_true')
    args = parser.parse_args()

    if args.debug:
        debug = True

    if args.filename is None:
        args.filename = tkFileDialog.askopenfilename(
            filetypes=[
                ('Word 2007+', '*.docx'),
                ('Plain text', '*.txt'),
            ])

    # A cancelled dialog yields an empty value; report that explicitly
    # instead of falling through to the "unsupported format" branch.
    if not args.filename:
        sys.stderr.write('Error: no input file selected.' + SEP)
        sys.exit(1)

    # Resolve the input path BEFORE changing directory: a relative name with
    # a directory component would otherwise no longer point at the file.
    input_path = os.path.abspath(args.filename)
    os.chdir(os.path.dirname(input_path))

    extension = os.path.splitext(args.filename)[1].lower()

    if extension == '.txt':

        with codecs.open(input_path, 'r', 'utf8') as input_file:
            input_text = input_file.read()

        final_structure = chgk_parse(input_text.replace('\r', ''))

    elif extension == '.docx':
        from pydocx import PyDocX
        from bs4 import BeautifulSoup
        from parse import parse
        import base64
        import html2text
        input_docx = PyDocX.to_html(input_path)
        bsoup = BeautifulSoup(input_docx)

        if args.debug:
            with codecs.open('debug.pydocx', 'w', 'utf8') as dbg:
                dbg.write(input_docx)

        def generate_imgname(ext):
            # First 'NNN.ext' name that does not exist in the current
            # directory yet.
            imgcounter = 1
            while os.path.isfile('{:03}.{}'.format(imgcounter, ext)):
                imgcounter += 1
            return '{:03}.{}'.format(imgcounter, ext)

        for tag in bsoup.find_all('style'):
            tag.extract()
        for tag in bsoup.find_all('p'):
            if tag.string:
                tag.string = tag.string + SEP
        for tagname in ('b', 'strong'):
            for tag in bsoup.find_all(tagname):
                tag.unwrap()
        for tagname in ('i', 'em'):
            for tag in bsoup.find_all(tagname):
                # .string is None when the tag has several children; in that
                # case only drop the tag instead of raising on '_' + None.
                if tag.string is not None:
                    tag.string = '_' + tag.string + '_'
                tag.unwrap()
        for tag in bsoup.find_all('li'):
            if tag.string:
                tag.string = '- ' + tag.string
        for tag in bsoup.find_all('img'):
            imgparse = parse('data:image/{ext};base64,{b64}', tag['src'])
            imgname = generate_imgname(imgparse['ext'])
            tag.insert_before('(img {})'.format(imgname))
            if not args.debug:
                with open(imgname, 'wb') as f:
                    f.write(base64.b64decode(imgparse['b64']))
            tag.extract()
        for tag in bsoup.find_all('a'):
            link_text = tag.string
            if link_text is not None and rew(link_text) == '':
                tag.extract()
            else:
                # Replace the link content with its target; anchors without
                # an href are simply unwrapped.
                href = tag.get('href')
                if href is not None:
                    tag.string = href
                tag.unwrap()

        h = html2text.HTML2Text()
        h.body_width = 0
        txt = (h.handle(bsoup.prettify())
            .replace('\\-', '')
            .replace('\\.', '.')
            .replace('( ', '(')
            .replace('[ ', '[')
            .replace(' )', ')')
            .replace(' ]', ']')
            .replace(' :', ':')
            )

        if args.debug:
            with codecs.open('debug.debug', 'w', 'utf8') as dbg:
                dbg.write(txt)

        final_structure = chgk_parse(txt)

    else:
        sys.stderr.write('Error: unsupported file format.' + SEP)
        sys.exit(1)     # non-zero status: this is a failure

    # The output is written relative to the script's own directory.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    output_name = make_filename(args.filename)

    with codecs.open(output_name, 'w', 'utf8') as output_file:
        output_file.write(compose_4s(final_structure))

    print('Please review the resulting file {}:'.format(output_name))
    subprocess.call(shlex.split('{} "{}"'
        .format(
            TEXTEDITOR,
            output_name).encode('cp1251', errors='replace')))
Exemple #2
0
def chgk_parse(text):

    """
    Parse plain question text into a list of labelled elements.

    Parsing rationale: every Question has two required fields: 'question' and
    the immediately following 'answer'. All the rest are optional, as is
    the order of these fields. On the other hand, everything
    except the 'question' is obligatorily marked, while the 'question' is
    optionally marked. But IF the question is not marked, 'meta' comments
    between Questions will not be parsed as 'meta' but will be merged to
    'question's.
    Parsing is done by regexes in the following steps:

    1. Identify all the fields you can, mark them with their respective
        labels, mark all the others with ''
    2. Merge fields inside Question with '' lines between them
    3. Ensure every 'answer' has a 'question'
    4. Mark all remaining '' fields as 'meta'
    5. Prettify input
    6. Pack Questions into dicts
    7. Return the resulting structure

    Args:
        text: the whole input as a single string; it is split into lines
            on LF / CRLF line breaks.

    Returns:
        A list of two-item lists. Elements whose label is in
        QUESTION_LABELS are packed into ['Question', dict] entries (the dict
        maps field labels to values); every other element is kept as
        [label, value].

    The working list is stored on the function attribute
    ``chgk_parse.structure`` so that the nested helpers can rebind and
    mutate it; it is reset at every call.
    """

    # Labels that stop merge_to_x_until_nextfield from absorbing '' lines.
    BADNEXTFIELDS = set(['question', 'answer'])

    # NOTE(review): WHITESPACE and PUNCTUATION are not referenced anywhere
    # else in this function.
    WHITESPACE = set([' ', ' ', '\n', '\r'])
    PUNCTUATION = set([',', '.', ':', ';', '?', '!'])

    # Field markers (Russian keywords), matched case-insensitively.
    re_tour = re.compile(r'^ТУР ?([0-9IVXLCDM]*)([\.:])?$', re.I | re.U)
    re_tourrev = re.compile(r'^([0-9]+) ТУР([\.:])?$', re.I | re.U)
    re_question = re.compile(r'ВОПРОС ?[№N]?([0-9]*) ?[\.:]', re.I | re.U)
    re_answer = re.compile(r'ОТВЕТЫ? ?[№N]?([0-9]+)? ?[:]', re.I | re.U)
    re_zachet = re.compile(r'ЗАЧ[ЕЁ]Т ?[\.:]', re.I | re.U)
    re_nezachet = re.compile(r'НЕЗАЧ[ЕЁ]Т ?[\.:]', re.I | re.U)
    re_comment = re.compile(r'КОММЕНТАРИ[ИЙ] ?[№N]?([0-9]+)? ?[\.:]', re.I | re.U)
    re_author = re.compile(r'АВТОР\(?Ы?\)? ?[\.:]', re.I | re.U)
    re_source = re.compile(r'ИСТОЧНИК\(?И?\)? ?[\.:]', re.I | re.U)
    re_editor = re.compile(r'РЕДАКТОР(Ы|СКАЯ ГРУППА)? ?[\.:]', re.I | re.U)
    re_date = re.compile(r'ДАТА ?[\.:]', re.I | re.U)
    # NOTE(review): re_handout is compiled but is not part of `regexes`
    # below, so it never labels a line here.
    re_handout = re.compile(r'РАЗДА(ЧА|ТКА|ТОЧНЫЙ МАТЕРИАЛ) ?[\.:]', re.I | re.U)
    # Leading item number such as "12." or "12)".
    re_number = re.compile(r'^[0-9]+[\.\)] *')

    # label -> compiled marker pattern; used for step 1 and for stripping
    # the marker text in step 4.
    regexes = {
        'tour' : re_tour,
        'tourrev' : re_tourrev,
        'question' : re_question,
        'answer' : re_answer,
        'zachet' : re_zachet,
        'nezachet' : re_nezachet,
        'comment' : re_comment,
        'author' : re_author,
        'source' : re_source,
        'editor' : re_editor,
        'date' : re_date,
    }

    # Working list of [label, text] pairs, shared with the helpers below.
    chgk_parse.structure = []

    # Remove element `index` and append its text (after SEP) to the element
    # before it.
    def merge_to_previous(index):
        target = index - 1
        chgk_parse.structure[target][1] = (
            chgk_parse.structure[target][1] + SEP 
            + chgk_parse.structure.pop(index)[1])

    # Remove element `index` and prepend its text (plus SEP) to the element
    # that follows it (which then occupies `index`).
    def merge_to_next(index):
        target = chgk_parse.structure.pop(index)
        chgk_parse.structure[index][1] = (target[1] + SEP 
            + chgk_parse.structure[index][1])

    # Index of the first element after `index` labelled `fieldname`.
    # There is no bounds check: IndexError if no such element exists.
    # Not called anywhere in this function.
    def find_next_specific_field(index, fieldname):
        target = index + 1
        while chgk_parse.structure[target][0] != fieldname:
            target += 1
        return target

    # Label of the first labelled element after `index`. If only ''
    # elements follow, the label of the last element ('') is returned;
    # if `index` is the last element, the result is None (implicit return).
    def find_next_fieldname(index):
        target = index + 1
        if target < len(chgk_parse.structure):
            debug_print(pprint.pformat(
                chgk_parse.structure[target]))
            while (target < len(chgk_parse.structure)-1
                and chgk_parse.structure[target][0] == ''):
                target += 1
            return chgk_parse.structure[target][0]

    # For every element labelled `x`, merge the following elements into it
    # until the next element is labelled `y` (or the list ends).
    def merge_y_to_x(x, y):
        i = 0
        while i < len(chgk_parse.structure):
            if chgk_parse.structure[i][0] == x:
                while (i+1 < len(chgk_parse.structure) 
                    and chgk_parse.structure[i+1][0] != y):
                    merge_to_previous(i+1)
            i += 1

    # For every element labelled `x`, absorb the directly following ''
    # elements, but only while the next labelled field is not one of
    # BADNEXTFIELDS.
    def merge_to_x_until_nextfield(x):
        i = 0
        while i < len(chgk_parse.structure):
            if chgk_parse.structure[i][0] == x:
                while (i+1 < len(chgk_parse.structure) 
                    and chgk_parse.structure[i+1][0] == ''
                    and find_next_fieldname(i) not in BADNEXTFIELDS):
                    merge_to_previous(i+1)
            i += 1

    # Same as merge_to_x_until_nextfield, without the BADNEXTFIELDS check.
    def dirty_merge_to_x_until_nextfield(x):
        i = 0
        while i < len(chgk_parse.structure):
            if chgk_parse.structure[i][0] == x:
                while (i+1 < len(chgk_parse.structure) 
                    and chgk_parse.structure[i+1][0] == ''):
                    merge_to_previous(i+1)
            i += 1

    # Swap two elements in place. Not called anywhere in this function.
    def swap_elements(x, y):
        z = chgk_parse.structure[y]
        chgk_parse.structure[y] = chgk_parse.structure[x]
        chgk_parse.structure[x] = z

    # 1.

    # Every non-empty line becomes an unlabelled ['', text] element; the
    # text is passed through rew() (helper defined elsewhere).
    for x in re.split(r'\r?\n',text):
        if x != '':
            chgk_parse.structure.append(['',rew(x)])

    i = 0
    # `st` is the same list object as chgk_parse.structure (an alias).
    st = chgk_parse.structure
    while i < len(st):
        # Set of (label, start offset of the first match) for every marker
        # pattern found in this line.
        matching_regexes = {(regex, regexes[regex].search(st[i][1]).start(0)) 
        for regex in regexes if regexes[regex].search(st[i][1])}
        
        # If more than one regex matches string, split it and 
        # insert into structure separately.
        
        if len(matching_regexes) == 1: 
            st[i][0] = matching_regexes.pop()[0]
        elif len(matching_regexes) > 1:
            # Cut the line at the start of every match except the earliest;
            # the head (up to the second match) stays at position i.
            sorted_r = sorted(matching_regexes, key=lambda x: x[1])
            slices = []
            for j in range(1, len(sorted_r)):
                slices.append(
                    [sorted_r[j][0], st[i][1][
                        sorted_r[j][1] 
                         : 
                        sorted_r[j+1][1] if j+1 < len(sorted_r)
                                                else len(st[i][1])]])
            # NOTE(review): each slice is inserted at i+1, so with three or
            # more matches the slices end up in reverse order - confirm
            # whether that is intended. The inserted elements are examined
            # again by the following iterations.
            for slice_ in slices:
                chgk_parse.structure.insert(
                    i+1, slice_)
            st[i][0] = sorted_r[0][0]
            st[i][1] = st[i][1][:sorted_r[1][1]]
        i += 1
    # No-op rebinding: `st` was never rebound away from the same list.
    chgk_parse.structure = st
    i = 0
        

    # 2.

    merge_y_to_x('question','answer')
    merge_to_x_until_nextfield('answer')
    merge_to_x_until_nextfield('comment')

    # 3.

    # Put an empty 'newquestion' placeholder in front of every 'answer'
    # that is not directly preceded by a question. After an insertion the
    # scan restarts (i = 0 followed by i += 1, i.e. from index 1).
    # NOTE(review): on the first pass i == 0, so structure[i-1] is the
    # LAST element (negative index) - verify this is harmless.
    i = 0
    while i < len(chgk_parse.structure):
        if (chgk_parse.structure[i][0] == 'answer' 
            and chgk_parse.structure[i-1][0] not in ('question',
                'newquestion')):
            chgk_parse.structure.insert(i,['newquestion',''])
            i = 0
        i += 1
    
    # Fold an unlabelled element that directly precedes a 'newquestion'
    # into it. After merge_to_next(i), index i holds the placeholder with
    # the merged text.
    i = 0
    while i < len(chgk_parse.structure) - 1:
        if (chgk_parse.structure[i][0] == ''
            and chgk_parse.structure[i+1][0] == 'newquestion'):
            merge_to_next(i)
            # A leading "N." / "N)" (and none on the previous element)
            # promotes the placeholder to a real 'question'; the number
            # prefix is stripped from its text.
            if (re_number.search(
                            rew(chgk_parse.structure[i][1])) and
            not re_number.search(
                rew(chgk_parse.structure[i-1][1]))):
                chgk_parse.structure[i][0] = 'question'
                chgk_parse.structure[i][1] = re_number.sub('',rew(
                    chgk_parse.structure[i][1]))
                # NOTE(review): the prefix was stripped just above, so this
                # search normally finds nothing (and group(0) would include
                # the trailing "." / ")", which int() rejects); the bare
                # except hides the failure, so no 'number' element appears
                # to be inserted here - confirm the intended order.
                try:
                    chgk_parse.structure.insert(i, ['number', int(re_number.search(
                            rew(chgk_parse.structure[i][1])).group(0))])
                except:
                    pass
            i = 0
        i += 1

    # Placeholders that were not promoted above become questions as well.
    for element in chgk_parse.structure:
        if element[0] == 'newquestion':
            element[0] = 'question'

    dirty_merge_to_x_until_nextfield('source')

    # An 'author' element consisting only of the marker absorbs the next
    # element (the list is modified while it is being enumerated).
    # NOTE(review): this re.search is called without the re.I | re.U flags
    # that re_author was compiled with, so the check is case-sensitive -
    # confirm that is intended.
    for id, element in enumerate(chgk_parse.structure):
        if (element[0] == 'author' and re.search(r'^{}$'.format(re_author.
            pattern),
            rew(element[1]))
            and id + 1 < len(chgk_parse.structure)):
            merge_to_previous(id+1)
    
    merge_to_x_until_nextfield('zachet')
    merge_to_x_until_nextfield('nezachet')
    
    # 4.

    # Drop elements that have neither a label nor any text.
    chgk_parse.structure = [x for x in chgk_parse.structure if [x[0], rew(x[1])]
        != ['', '']]

    # An unlabelled first element starting with a number is folded into the
    # element after it.
    # NOTE(review): raises IndexError when the structure is empty (e.g.
    # empty input) or, in merge_to_next, when it has a single element.
    if chgk_parse.structure[0][0] == '' and re_number.search(
        rew(chgk_parse.structure[0][1])):
        merge_to_next(0)

    # Remaining unlabelled elements become 'meta'. For labelled fields
    # (except tour headers) the marker text is removed; for questions the
    # number captured by the marker is inserted first as a ['number', str]
    # element. Inserting while enumerating shifts the current element, so
    # it is visited once more on the next iteration.
    for id, element in enumerate(chgk_parse.structure):
        if element[0] == '':
            element[0] = 'meta'
        if element[0] in regexes and element[0] not in ['tour', 'tourrev']:
            if element[0] == 'question':
                try:
                    num = re_question.search(element[1]).group(1)
                    chgk_parse.structure.insert(id, ['number', num])
                except:
                    pass
            element[1] = regexes[element[0]].sub('', element[1])

    # 5.

    for id, element in enumerate(chgk_parse.structure):
        
        # typogrify

        # typotools is a module defined elsewhere; dates are left as is.
        if element[0] != 'date':
            element[1] = typotools.recursive_typography(element[1])

        # remove question numbers

        # NOTE(review): the question marker was already stripped in step 4,
        # so this search presumably fails for most questions and the bare
        # except swallows the error; the re_number.sub below still runs.
        if element[0] == 'question':
            try:
                num = re_question.search(element[1]).group(1)
                chgk_parse.structure.insert(id, ['number', num])
            except:
                pass
            element[1] = re_number.sub('', element[1])
        
        # detect inner lists

        # Numbering tokens "N." / "N)" preceded by whitespace or the start
        # of the text and not followed by another digit.
        mo = {m for m 
            in re.finditer(r'(\s+|^)(\d+)[\.\)]\s*(?!\d)',element[1], re.U)}
        if len(mo) > 1:
            sorted_up = sorted(mo, key=lambda m: int(m.group(2)))
            j = 0
            # Collect the run 1, 2, 3, ... as
            # (number, matched token text, start offset).
            list_candidate = []
            while j == int(sorted_up[j].group(2)) - 1:
                list_candidate.append((j+1, sorted_up[j].group(0), 
                    sorted_up[j].start()))
                if j+1 < len(sorted_up):
                    j += 1
                else:
                    break
            if len(list_candidate) > 1:
                # NOTE(review): `and` binds tighter than `or`, so this
                # reads as: not a question, OR (question and contains
                # 'дуплет'), OR contains 'блиц'.
                if (element[0] != 'question' or 
                    (element[0] == 'question'
                        and 'дуплет' in element[1].lower() 
                            or 'блиц' in element[1].lower())):
                    # partition() is defined elsewhere; presumably it cuts
                    # the text at the given offsets - TODO confirm.
                    part = partition(element[1], [x[2] for x in
                        list_candidate])
                    # Strip the numbering token from each list item.
                    lc = 0
                    while lc < len(list_candidate):
                        part[lc+1] = part[lc+1].replace(list_candidate[lc][1], '')
                        lc += 1
                    # Value becomes [intro, items], or just items when
                    # there is no text before the first item.
                    element[1] = ([part[0], part[1:]] if part[0] != ''
                                            else part[1:])

        # turn source into list if necessary

        # A multi-line string source becomes a list of lines with leading
        # numbers removed (basestring: Python 2 only).
        if (element[0] == 'source' and isinstance(element[1], basestring)
                    and len(re.split(r'\r?\n', element[1])) > 1):
            element[1] = [re_number.sub('', rew(x)) 
                for x in re.split(r'\r?\n', element[1])]




    # 6.

    final_structure = []
    current_question = {}

    for element in chgk_parse.structure:
        # A tour header, a question or a meta line closes the question
        # collected so far (check_question is defined elsewhere).
        if element[0] in set(['tour', 'question', 'meta']): 
            if current_question != {}:
                check_question(current_question)
                final_structure.append(['Question', current_question])
                current_question = {}
        # Fields listed in QUESTION_LABELS (defined elsewhere) go into the
        # current question dict; repeated fields are joined with SEP.
        if element[0] in QUESTION_LABELS:
            if element[0] in current_question:
                try:
                    current_question[element[0]] += SEP + element[1]
                except:
                    # NOTE(review): on a failed concatenation this prints
                    # the question and drops into the pdb debugger.
                    print('{}'.format(current_question).decode('unicode_escape'))
                    pdb.set_trace()
            else:
                current_question[element[0]] = element[1]
        else:
            final_structure.append([element[0], element[1]])
    # Flush the last question.
    if current_question != {}:
        check_question(current_question)
        final_structure.append(['Question', current_question])


    # 7.

    debug_print(pprint.pformat(final_structure).decode('unicode_escape'))
    return final_structure
Exemple #3
0
def parse_4s(s):
    """Parse text in the 4s markup into a list of labelled elements.

    Every line that starts with one of the markers in ``mapping`` opens a new
    element; other non-blank lines continue the previous element; blank
    lines terminate the current question. Lines starting with ``-`` inside
    a multi-line element are collected into a list.

    Args:
        s: the whole 4s document as one string (a leading BOM is dropped).

    Returns:
        A list of two-item lists: ``['Question', dict]`` for each question
        (the dict maps labels from QUESTION_LABELS to values and always has
        a ``'number'``), and ``[label, value]`` for every other element.

    When the module-level ``debug`` flag is set, intermediate results are
    dumped to ``raw.debug``, ``debug1st.debug`` and ``debug.debug`` in the
    current directory.
    """
    mapping = {
        '#' : 'meta',
        '##' : 'section',
        '###' : 'heading',
        '###LJ': 'ljheading',
        '#EDITOR': 'editor',
        '#DATE': 'date',
        '?': 'question',
        '№': 'number',
        '№№': 'setcounter',
        '!': 'answer',
        '=': 'zachet',
        '!=': 'nezachet',
        '^': 'source',
        '/': 'comment',
        '@': 'author',
        '>': 'handout',
    }

    structure = []

    # Drop a leading BOM. The length is tested first so that an empty
    # string does not raise IndexError on s[0].
    if len(s) > 1 and s[0] == '\ufeff':
        s = s[1:]

    lines = s.split('\n')

    # Like the other dumps below, only written in debug mode.
    if debug:
        with codecs.open('raw.debug', 'w', 'utf8') as debugf:
            debugf.write(pprint.pformat(lines).decode('unicode_escape'))

    for line in lines:
        if rew(line) == '':
            structure.append(['', ''])
            continue
        marker = line.split()[0]
        if marker in mapping:
            # lstrip() so that leading whitespace before the marker does not
            # leave the marker itself inside the value.
            structure.append([mapping[marker],
                rew(line.lstrip()[len(marker):])])
        elif structure:
            # Continuation line: belongs to the latest element, including
            # the very first one.
            structure[-1][1] += '\n' + line

    final_structure = []
    current_question = {}
    counter = 1

    def finalize(question, counter):
        # Number the question, store it and return the updated counter.
        # NOTE(review): this assert is always true (all() over a generator
        # that only yields True). It is kept as-is: making it a real check
        # would reject inputs that are accepted today.
        assert all(True for label in REQUIRED_LABELS
            if label in question)
        if 'setcounter' in question:
            counter = int(question['setcounter'])
        if 'number' not in question:
            question['number'] = counter
            counter += 1
        final_structure.append(['Question', question])
        return counter

    if debug:
        with codecs.open('debug1st.debug', 'w', 'utf8') as debugf:
            debugf.write(pprint.pformat(structure).decode('unicode_escape'))

    for element in structure:

        # find list in element

        sp = element[1].split('\n')
        if len(sp) > 1:
            list_candidate = []

            for line in sp:
                words = rew(line).split()
                if len(words) > 1 and words[0] == '-':
                    list_candidate.append(rew(rew(line)[1:]))

            sp = [spsp for spsp in sp
                if rew(rew(spsp)[1:]) not in list_candidate]

            if len(sp) == 0 or (len(sp) == 1 and sp[0] == ''):
                element[1] = list_candidate
            elif len(list_candidate) > 1:
                element[1] = ['\n'.join(sp), list_candidate]
            # Otherwise (fewer than two list items) the text stays as it is.

        label, value = element[0], element[1]

        if label in QUESTION_LABELS:
            if label in current_question:
                existing = current_question[label]

                if (isinstance(existing, basestring)
                    and isinstance(value, basestring)):
                    current_question[label] += '\n' + value

                elif (isinstance(existing, list)
                    and isinstance(value, basestring)):
                    existing[0] += '\n' + value

                elif (isinstance(existing, basestring)
                    and isinstance(value, list)):
                    # NOTE(review): the new text is placed BEFORE the
                    # existing one here, unlike the other branches -
                    # behaviour kept, confirm whether it is intended.
                    current_question[label] = [value[0] + '\n'
                        + existing, value[1]]

                elif (isinstance(existing, list)
                    and isinstance(value, list)):
                    existing[0] += '\n' + value[0]
                    existing[1] += value[1]
            else:
                current_question[label] = value

        elif label == '':
            # Blank line: the question collected so far is complete.
            if current_question != {}:
                counter = finalize(current_question, counter)
            current_question = {}

        else:
            final_structure.append([label, value])

    if current_question != {}:
        counter = finalize(current_question, counter)

    if debug:
        with codecs.open('debug.debug', 'w', 'utf8') as debugf:
            debugf.write(pprint.pformat(final_structure))

    return final_structure