Example #1
0
def format_paragraphs(iterable):
    """Expand each input row into one output row per block of its cells.

    Every cell of a row is passed through ``readers.blocks_nosep``
    (presumably splitting it into paragraph-like blocks -- confirm against
    the ``readers`` module).  The per-cell block sequences are then walked
    in lockstep, so the n-th output row holds the n-th block of every
    cell; cells that run out of blocks early are padded with ``' '``.

    Each yielded row is a lazy generator of strings in which carriage
    returns and newlines are replaced by single spaces, and falsy blocks
    are rendered as ``' '``.  A yielded row can only be consumed once.
    """
    for record in iterable:
        # Built eagerly so every cell is split before the rows are zipped.
        columns = [readers.blocks_nosep(field) for field in record]
        for aligned in izip_longest(*columns, fillvalue=' '):
            yield (
                text.replace('\r', ' ').replace('\n', ' ') if text else ' '
                for text in aligned
            )
Example #2
0
def _match_quoted_questions(question_body, q, a):
    """Collect question fragments quoted inside the answer and their replies.

    Walks the answer phrases ``a`` in order.  A phrase for which
    ``is_blockquote_attempt`` is true is stripped of its quote/emphasis
    markers and, if the remaining text occurs in ``q`` or in
    ``question_body``, recorded as a quoted question.  Consecutive quoted
    questions are merged into one group.  Every other phrase seen after the
    first quoted question is recorded as (part of) the reply to the most
    recent group; phrases before the first quote, and quote attempts that
    do not match the question, are dropped.

    Returns ``(quoted_q, quoted_a)``: two lists of lists of phrases.
    ``quoted_q`` is non-empty exactly when at least one quote matched.
    """
    quote = False        # a quoted question has been seen at some point
    last_quote = False   # the previous recorded phrase was a quoted question
    quoted_q = []
    quoted_a = []
    for rphrase in a:
        phrase = rphrase.strip()
        if is_blockquote_attempt(phrase, rphrase):
            # Strip quote/emphasis markers before looking the text up.
            phrase = phrase.lstrip('*> "').rstrip(' *"')
            if phrase in q or phrase in question_body:
                if last_quote:
                    quoted_q[-1].append(phrase)
                else:
                    quoted_q.append([phrase])
                last_quote = quote = True
        elif quote:
            if last_quote:
                # First reply phrase after a quote starts a new answer group.
                quoted_a.append([phrase])
            else:
                quoted_a[-1].append(phrase)
            last_quote = False
    return quoted_q, quoted_a


def _group_by_word_overlap(q, a):
    """Group question and answer phrases by the words they share.

    ``words(phrase)`` is computed for every phrase (the results are combined
    with ``&``, so they are expected to be sets).  Each question/answer pair
    is scored by the size of that intersection; pairs scoring above the
    average are placed in a common group.

    Returns ``(questions, answers)``: two equally long lists of strings,
    each string being the phrases of one group joined by two spaces.  A
    group may contribute an empty string on either side.
    """
    words_q = [words(phrase) for phrase in q]
    words_a = [words(phrase) for phrase in a]

    # Pairs are stored in (i, j) order, which is the order the grouping
    # pass below must visit them in.
    scores = []
    total = 0
    # Starts at 1 so the division below can never hit zero, even when no
    # pair shares a word.
    amount = 1
    for i, wordset in enumerate(words_q):
        for j, wordset_ in enumerate(words_a):
            score = len(wordset & wordset_)
            scores.append((i, j, score))
            if score:
                total += score
                amount += 1

    avg = total / float(amount)

    q_groups = []
    a_groups = []
    positions = {}          # question index -> group index
    answer_positions = {}   # answer index -> group index
    used_q = set()
    used_a = set()
    for i, j, score in scores:
        if score <= avg:
            continue
        # An answer that is already grouped takes precedence over the
        # question's own group.
        pos = answer_positions.get(j, positions.get(i))
        if pos is None:
            positions[i] = pos = len(q_groups)
            q_groups.append([])
            a_groups.append([])
        if i not in used_q:
            q_groups[pos].append(q[i])
            used_q.add(i)
        if j not in used_a:
            a_groups[pos].append(a[j])
            used_a.add(j)
            answer_positions[j] = pos

    return (
        ['  '.join(group) for group in q_groups],
        ['  '.join(group) for group in a_groups],
    )


def QandA_splitquestions(iterable):
    """Split multi-part question/answer pairs into individual pairs.

    ``iterable`` yields ``(question, answer)`` pairs; both objects must
    expose a ``body`` attribute and be copyable with ``copy``.  For every
    input pair the bodies are broken into blocks and matched up using, in
    order:

    1. questions quoted inside the answer (block quotes or fully
       emphasised paragraphs), each paired with the text that follows it;
    2. if there are more answer blocks than question blocks, splitting the
       question blocks further with ``questions``;
    3. if the counts still differ, grouping by shared words.

    Yields ``(question_copy, answer_copy)`` tuples whose ``body`` holds one
    question part and the ``reformat``-ed matching answer part.  Input
    pairs for which no question text or no answer text remains are skipped.
    """
    for question, answer in iterable:
        q = [
            flatten_paragraph(reformat(formats.link.remove(p)))
            for p in remove_thank_blocks(readers.blocks_nosep(question.body))
        ]
        a = list(remove_thank_blocks(readers.blocks_nosep(
            formats.link.shorten(answer.body)
        )))

        # first, try to match blockquotes against questions
        # and treat fully strong/emphasised paragraphs as such
        # (courtesy of mojangles)
        quoted_q, quoted_a = _match_quoted_questions(question.body, q, a)

        if quoted_q:
            if not quoted_a:
                # Quotes were found but nothing followed them.
                continue
            # One joined question per group, padded with '' so that q and
            # a have the same length.
            q = []
            a = []
            for ques, ans in izip(quoted_q, quoted_a):
                q.append('  '.join(ques))
                q.extend([''] * (len(ans) - 1))
                a.extend(ans)
            diff = 0
        else:
            diff = len(q) - len(a)

        if len(q) == 1 or len(a) == 1:
            q = ['  '.join(q)]
            diff = 0

        if diff < 0:
            # More answer blocks than question blocks: try splitting the
            # question blocks into individual questions.
            split_q = list(chain.from_iterable(questions(q__) for q__ in q))
            if not split_q:
                a = [' '.join(a)]
                diff = 0
            elif len(split_q) == len(a):
                q = split_q
                diff = 0

        if diff:
            q, a = _group_by_word_overlap(q, a)

        # List comprehensions (rather than filter()) keep the emptiness
        # check below meaningful on Python 3 as well, where a filter
        # object is always truthy.
        # NOTE(review): q and a are filtered independently, which also
        # removes the '' padding inserted in the quote branch and empty
        # groups from the word-overlap step; this can shift which answer
        # a question is paired with.  Behaviour kept as-is -- confirm
        # whether that is intended.
        q = [phrase for phrase in q if phrase]
        a = [phrase for phrase in a if phrase]
        if not (q and a):
            continue

        for q_, a_ in izip_longest(q, a, fillvalue=''):
            question_ = copy(question)
            question_.body = q_
            answer_ = copy(answer)
            answer_.body = reformat(a_)
            yield question_, answer_