Ejemplo n.º 1
0
 def test_latex_to_unicode(self):
     """textutils - latex_to_unicode"""
     # Accented letters: the translated text is compared as UTF-8 bytes.
     accented = translate_latex2unicode("\\'a \\'i \\'U")
     self.assertEqual(accented.encode('utf-8'), "á í Ú")

     # Acute accent on a consonant and an ogonek given with braces.
     acute_and_ogonek = translate_latex2unicode("\\'N \\k{i}")
     self.assertEqual(acute_and_ogonek, u'\u0143 \u012f')

     # A macro glued directly to the following letters.
     angstrom_name = translate_latex2unicode("\\AAkeson")
     self.assertEqual(angstrom_name, u'\u212bkeson')

     # Math-mode symbol that maps outside the Basic Multilingual Plane.
     slanted_zeta = translate_latex2unicode("$\\mathsl{\\Zeta}$")
     self.assertEqual(slanted_zeta, u'\U0001d6e7')
Ejemplo n.º 2
0
def translate_fieldvalues_from_latex(record, tag, code='', encoding='utf-8'):
    """
    Translate LaTeX markup found in the subfield values of a record.

    Every field instance of the given tag is visited. Each subfield whose
    code matches (or every subfield when no code is given) has its value
    passed through translate_latex2unicode, encoded with the chosen encoding
    and stored back in the record through record_modify_subfield.

    @param record: record to modify, in BibRec style structure
    @type record: dict

    @param tag: tag of fields to modify
    @type tag: string

    @param code: restrict the translation to a given subfield code
    @type code: string

    @param encoding: character encoding for the new value. Defaults to UTF-8.
    @type encoding: string
    """
    for field in record_get_field_instances(record, tag):
        # field[0] holds the (code, value) subfield pairs and field[4] is
        # handed on as the global position of the field.
        for position, (subfield_code, subfield_value) in enumerate(field[0]):
            if code != '' and subfield_code != code:
                continue
            translated = translate_latex2unicode(subfield_value)
            record_modify_subfield(record, tag, subfield_code,
                                   translated.encode(encoding), position,
                                   field_position_global=field[4])
Ejemplo n.º 3
0
def translate_fieldvalues_from_latex(record, tag, code='', encoding='utf-8'):
    """
    Replace LaTeX-encoded subfield values of a record with encoded text.

    All instances of the field tag are processed. For each selected subfield
    the value is converted with translate_latex2unicode, encoded, and written
    back with record_modify_subfield at the same subfield position.

    @param record: record to modify, in BibRec style structure
    @type record: dict

    @param tag: tag of fields to modify
    @type tag: string

    @param code: restrict the translation to a given subfield code; the
        empty string selects every subfield
    @type code: string

    @param encoding: character encoding for the new value. Defaults to UTF-8.
    @type encoding: string
    """
    translate_all = (code == '')
    for instance in record_get_field_instances(record, tag):
        pairs = instance[0]
        for index, pair in enumerate(pairs):
            subfield_code, subfield_value = pair
            if translate_all or subfield_code == code:
                newvalue = translate_latex2unicode(subfield_value)
                newvalue = newvalue.encode(encoding)
                # instance[4] is passed on as the field's global position
                record_modify_subfield(
                    record, tag, subfield_code, newvalue, index,
                    field_position_global=instance[4])
def process_author_name(author):
    """Convert author to INSPIRE form."""


    #test for ALLCAPS
    author = author.replace('YYYY', '')
    if re.search(r'[A-Z][A-Z]', author):
        author_uplow = ''
        for part in author.split(' '):
            if part.upper() == part and not re.match(r'I[IV]+', part):
                part = part.title()
            author_uplow += ' ' + part
        author = author_uplow
    author = author.replace('Inspire', 'INSPIRE')

    #print 'INPUT = ', author
    author = author.replace(r'\.', r'xxxx')
    author = author.replace(r'.', '. ')
    author = author.replace(r'xxxx', r'\.')
    author = re.sub('[ ]+', ' ', author)
    author = re.sub(r'\\(cor|corauth|fn)ref\{\w+\}', r'', author)
    author = re.sub(r'\}?\\thanks\{\\?.*\}?', r'', author)
    author = author.replace(r'\,', r'~')
    author = author.replace(r'\~', r'xxxx')
    author = author.replace(r'~', r' ')
    author = author.replace(r'xxxx', r'\~')
    #print 'MIDWAY1 =', author
    author = translate_latex2unicode(author)
    if '\\' in author:
        print 'Problem with', author
    author = author.replace(',', ', ')
    author = author.replace('.', '. ')
    author = re.sub(r'\s+', ' ', author)
    author = re.sub(r'\s+$', '', author)
    author = re.sub(r'^\s+', '', author)
    #print 'MIDWAY2 =', author
    match_object_1 = re.match(r'^(.*\w) ([IVJr\.]{2,}$)', author)
    match_object_2 = re.match(ur'(.*) (\(.*\))', author)
    if match_object_1 or match_object_2:
        if match_object_1:
            match_object = match_object_1
        elif match_object_2:
            match_object = match_object_2
        author = author_first_last(match_object.group(1)) + ', ' + \
                 match_object.group(2)
    else:
        author = author_first_last(author)
    author = author.replace(',', ', ')
    author = re.sub(r'\.\s+', '.', author)
    author = re.sub(r'\s+', ' ', author)
    author = re.sub(r'\s+$', '', author)
    author = re.sub(r'^\s+', '', author)



    #author = translate_latex2unicode(author)

    #print 'OUTPUT =', author
    return author
Ejemplo n.º 5
0
def get_captions(list_latex, list_pdf):
    """
	Function that takes all the caption from latex and pdf and creates a list for each case.
	Also it transforms all the latex simbols in unicode, removes any spaces or endline and
	removes Fig.<number>. Those transforms are used in the next step of matching captions.
	The original caption is obviously saved and put into MARCXML later.

	@param list_latex
	@param list_pdf

	@return caption_list_latex, caption_list_pdf, update_list_latex, update_list_pdf
	"""
    caption_list_latex = [figure.caption for figure in list_latex]
    caption_list_pdf = [figure.caption for figure in list_pdf]

    update_list_latex = [figure for figure in list_latex]
    update_list_pdf = [figure for figure in list_pdf]

    for index, caption in enumerate(caption_list_latex):
        # transform all latex simbols in unicode
        caption = translate_latex2unicode(caption)
        # remove all spaces and new lines and tabs from the beginning, then if a line contains \n, replace with space
        caption = caption.lstrip().replace('\n', ' ')
        update_list_latex[index].caption = caption
        # skip the figure number
        #ur UNICODE RAW
        caption = re.sub(ur'^[\d]+ ', '', caption)
        # ignore unicode chars
        caption = caption.encode('ascii', 'ignore')
        # ignore some \\ fields
        caption = re.sub(r'\\[a-z]+', '', caption)
        # delete special chars
        for char in caption:
            if char in ' _^${}[]':
                caption = caption.replace(char, '')
        # convert again to unicode
        unicode(caption)
        caption_list_latex[index] = caption

    # Transformations (special characters and newlines)
    for index, caption in enumerate(caption_list_pdf):
        # remove all spaces and new lines and tabs from the beginning, then if a line contains \n, replace with space
        caption = caption.lstrip().replace('\n', ' ')
        caption = strip_control_characters(caption)
        update_list_pdf[index].caption = caption
        # Skip Fig. number. at the beginning
        #ur UNICODE RAW
        caption = re.sub(ur'^Fig\. [\d]+\. ', '', caption)
        caption = re.sub(ur'^Figure [\d]+\: ', '', caption)

        #encode to ascii for eliminating unicode chars
        caption = caption.encode('ascii', 'ignore')
        caption = caption.replace(' ', '')
        #reconvert to unicode
        unicode(caption)
        caption_list_pdf[index] = caption

    return (caption_list_latex, caption_list_pdf, update_list_latex,
            update_list_pdf)
Ejemplo n.º 6
0
def process_author_name(author):
    """Convert author to INSPIRE form."""

    #test for ALLCAPS
    author = author.replace('YYYY', '')
    if re.search(r'[A-Z][A-Z]', author):
        author_uplow = ''
        for part in author.split(' '):
            if part.upper() == part and not re.match(r'I[IV]+', part):
                part = part.title()
            author_uplow += ' ' + part
        author = author_uplow
    author = author.replace('Inspire', 'INSPIRE')

    #print 'INPUT = ', author
    author = author.replace(r'\.', r'xxxx')
    author = author.replace(r'.', '. ')
    author = author.replace(r'xxxx', r'\.')
    author = re.sub('[ ]+', ' ', author)
    author = re.sub(r'\\(cor|corauth|fn)ref\{\w+\}', r'', author)
    author = re.sub(r'\}?\\thanks\{\\?.*\}?', r'', author)
    author = author.replace(r'\,', r'~')
    author = author.replace(r'\~', r'xxxx')
    author = author.replace(r'~', r' ')
    author = author.replace(r'xxxx', r'\~')
    #print 'MIDWAY1 =', author
    author = translate_latex2unicode(author)
    author = author.replace('\\"i', 'ï')
    if '\\' in author and not 'UTF8' in author:
        print 'Problem with', author
        sys.exit()
    author = author.replace(',', ', ')
    author = author.replace('.', '. ')
    author = re.sub(r'\s+', ' ', author)
    author = re.sub(r'\s+$', '', author)
    author = re.sub(r'^\s+', '', author)
    #print 'MIDWAY2 =', author
    match_object_1 = re.match(r'^(.*\w) ([IVJr\.]{2,}$)', author)
    match_object_2 = re.match(ur'(.*) (\(.*\))', author)
    if match_object_1 or match_object_2:
        if match_object_1:
            match_object = match_object_1
        elif match_object_2:
            match_object = match_object_2
        author = author_first_last(match_object.group(1)) + ', ' + \
                 match_object.group(2)
    else:
        author = author_first_last(author)
    author = author.replace(',', ', ')
    author = re.sub(r'\.\s+', '.', author)
    author = re.sub(r'\s+', ' ', author)
    author = re.sub(r'\s+$', '', author)
    author = re.sub(r'^\s+', '', author)

    #author = translate_latex2unicode(author)

    #print 'OUTPUT =', author
    return author
 def test_latex_to_unicode(self):
     """textutils - latex_to_unicode"""
     # The first case is checked on the UTF-8 encoded result.
     encoded = translate_latex2unicode("\\'a \\'i \\'U").encode('utf-8')
     self.assertEqual(encoded, "á í Ú")

     # The remaining cases are checked directly on the unicode result.
     expectations = (
         ("\\'N \\k{i}", u'\u0143 \u012f'),
         ("\\AAkeson", u'\u212bkeson'),
         ("$\\mathsl{\\Zeta}$", u'\U0001d6e7'),
     )
     for latex, expected in expectations:
         self.assertEqual(translate_latex2unicode(latex), expected)
Ejemplo n.º 8
0
def create_xml(eprint=None, doi=None, author_dict=None):
    """Take in the author dictionary and write it out as xml."""
    if eprint:
        search = 'find eprint ' + eprint + ' or recid ' + eprint
        if '/' in eprint or '.' in eprint:
            search = 'find eprint ' + eprint
        recid = perform_request_search(p=search, cc='HEP') or \
                perform_request_search(p=search, cc='Fermilab')
        try:
            recid = recid[0]
        except IndexError:
            print 'Do not have eprint or recid', search
            return None
    elif doi:
        try:
            search = 'find doi ' + doi
            recid = perform_request_search(p=search, cc='HEP')[0]
        except IndexError:
            print 'Do not have doi', search
            return None
    record = {}
    record_add_field(record, '001', controlfield_value=str(recid))
    tag = '100__'
    #EMAIL_REGEX = re.compile(r"^[\w\-\.\'\+]+@[\w\-\.]+\.\w{2,4}$")
    #ORCID_REGEX = re.compile(r'^0000-\d{4}-\d{4}-\d{3}[\dX]$')
    #INSPIRE_REGEX = re.compile(r'^INSPIRE-\d{8}$')
    for key in author_dict:
        subfields = []
        author = author_dict[key][0]
        #print author_dict
        match_obj = re.search(ORCID_REGEX, author)
        if match_obj:
            orcid = match_obj.group(1)
            if not re.match(ORCID_REGEX, orcid):
                print '1 Problem with', orcid
            if ('j', 'ORCID:' + orcid) not in subfields:
                subfields.append(('j', 'ORCID:' + orcid))
            author = author.replace(orcid, '')
            author = author.replace('[]', '')
        match_obj = re.search(r'(INSPIRE-\d{8})', author)
        if match_obj:
            inspire = match_obj.group(1)
            if not re.match(INSPIRE_REGEX, inspire):
                print 'Problem with', inspire
            subfields.append(('i', inspire))
            author = author.replace(inspire, '')
            author = author.replace('[]', '')
        if u':' in author:
            match_obj = re.match(u'(.*):(.*)', author)
            author = match_obj.group(1)
            subfields.append(('q', match_obj.group(2)))
        subfields.append(('a', author))

        for affiliation in author_dict[key][1]:
            affiliation = re.sub(r'\\affinfn{(.*)}{(.*)}', r'INFN \1 \2',
                                 affiliation)
            affiliation = re.sub(r'\\affuni{(.*)}{(.*)}', r'\1 University \2',
                                 affiliation)
            affiliation = translate_latex2unicode(affiliation)
            #affiliation = re.sub(r'(\w)\W*$', r'\1', affiliation)
            affiliation = re.sub(r'([\.\,]+)', r'\1 ', affiliation)
            affiliation = re.sub(r'\s+', ' ', affiliation)
            affiliation = re.sub(r'\s$', r'', affiliation)
            affiliation = re.sub(r'\s*also at[\:\s]*', r'', affiliation,
                                 re.IGNORECASE)
            affiliation = re.sub(r'\s*\\and$', r'', affiliation)

            if r"@" in affiliation and r"0000-" in affiliation:
                affiliation = affiliation.replace(';', ' ')
                affiliation = affiliation.replace(r'. ', r'.')
                email = re.search(r"(\S+\@\S+)", affiliation).group(1)
                orcid = re.search(r"(0000-\S+)", affiliation).group(1)
                if re.match(EMAIL_REGEX, email):
                    subfields.append(('m', 'email:' + email))
                else:
                    print "Email problem:", email
                if re.match(ORCID_REGEX, orcid) and \
                   ('j', 'ORCID:' + orcid) not in subfields:
                    subfields.append(('j', 'ORCID:' + orcid))
                else:
                    print "ORCID problem:", orcid
                continue
            elif r"@" in affiliation:
                affiliation = affiliation.replace(r'. ', r'.')
                subfields.append(('m', affiliation))
                continue
            #elif re.match(r"^0000-0", affiliation):
            elif re.search(r"0000-0", affiliation):
                #print 'XXX', affiliation
                for aff in affiliation.split():
                    aff = re.sub(r'[^\d^\-^X]', '', aff)
                    orcid = re.search(ORCID_REGEX, aff)
                    if orcid:
                        orcid = orcid.group(0)
                        if ('j', 'ORCID:' + orcid) not in subfields:
                            subfields.append(('j', 'ORCID:' + orcid))
                        affiliation = re.sub(orcid, '', affiliation)
                        affiliation = re.sub(r'\s+$', '', affiliation)

                        #print 'YYY', affiliation

                        #subfields.append(('v', affiliation))
                        #break
                if not orcid:
                    print "ORCID problem:", affiliation
                #Removed this to process aff that contained ORCID
                #continue
            elif re.match(r"^INSPIRE-", affiliation):
                subfields.append(('i', affiliation))
                #Removed this to process aff that contained ORCID
                #continue

            affiliation = affiliation.replace('[]', '')
            if not affiliation:
                continue
            affiliation_key = re.sub(r'\W+', ' ', affiliation).upper()
            affiliation_key = re.sub(r'\s*(.+\S)\s*', r'\1', affiliation_key)
            try:
                for inst in AFFILIATIONS_DONE[affiliation_key]:
                    inst = re.sub(r'^\s+', '', inst)
                    if inst:
                        subfields.append(('u', inst))
            except KeyError:
                if False:
                    print "AFF in: ", affiliation, "*"
                    time1 = time.time()
                inspire_affiliation = get_aff(unidecode(affiliation))
                if False:
                    time2 = time.time()
                    time_taken = time2 - time1
                    print "AFF out:", inspire_affiliation, \
                          "Time taken", time_taken
                for inst in inspire_affiliation:
                    inst = re.sub(r'^\s+', '', inst)
                    if inst:
                        subfields.append(('u', inst))
                if not TEST:
                    AFFILIATIONS_DONE[affiliation_key] = inspire_affiliation
            if affiliation:
                subfields.append(('v', affiliation))
        record_add_field(record, tag[0:3], tag[3], tag[4], \
                         subfields=subfields)
        tag = '700__'
    return print_rec(record)
def create_xml(eprint=None, doi=None, author_dict=None):
    """Take in the author dictionary and write it out as xml."""
    if eprint:
        search = 'find eprint ' + eprint + ' or recid ' + eprint
        if '/' in eprint or '.' in eprint:
            search = 'find eprint ' + eprint
        recid = perform_request_search(p=search, cc='HEP') or \
                perform_request_search(p=search, cc='Fermilab')
        try:
            recid = recid[0]
        except IndexError:
            print 'Do not have eprint or recid', search
            return None
    elif doi:
        try:
            search = 'find doi ' + doi
            recid = perform_request_search(p=search, cc='HEP')[0]
        except IndexError:
            print 'Do not have doi', search
            return None
    record = {}
    record_add_field(record, '001', controlfield_value=str(recid))
    tag = '100__'
    email_regex = re.compile(r"^[\w\-\.\'\+]+@[\w\-\.]+\.\w{2,4}$")
    orcid_regex = re.compile(r'^0000-\d{4}-\d{4}-\d{3}[\dX]$')
    inspire_regex = re.compile(r'^INSPIRE-\d{8}$')
    for key in author_dict:
        subfields = []
        author = author_dict[key][0]

        match_obj = re.search(r'(0000-\d{4}-\d{4}-\d{3}[\dX])', author)
        if match_obj:
            orcid = match_obj.group(1)
            if not re.match(orcid_regex, orcid):
                print 'Problem with', orcid
            subfields.append(('j', 'ORCID:' + orcid))
            author = author.replace(orcid, '')
            author = author.replace('[]', '')
        match_obj = re.search(r'(INSPIRE-\d{8})', author)
        if match_obj:
            inspire = match_obj.group(1)
            if not re.match(inspire_regex, inspire):
                print 'Problem with', inspire
            subfields.append(('i', inspire))
            author = author.replace(inspire, '')
            author = author.replace('[]', '')
        subfields.append(('a', author))

        for affiliation in author_dict[key][1]:
            affiliation = re.sub(r'\\affinfn{(.*)}{(.*)}', r'INFN \1 \2',
                                 affiliation)
            affiliation = re.sub(r'\\affuni{(.*)}{(.*)}', r'\1 University \2',
                                 affiliation)
            affiliation = translate_latex2unicode(affiliation)
            #affiliation = re.sub(r'(\w)\W*$', r'\1', affiliation)
            affiliation = re.sub(r'([\.\,]+)', r'\1 ', affiliation)
            affiliation = re.sub(r'\s+', ' ', affiliation)
            affiliation = re.sub(r'\s$', r'', affiliation)
            affiliation = re.sub(r'\s*also at[\:\s]*', r'',  
                                 affiliation, re.IGNORECASE)
            affiliation = re.sub(r'\s*\\and$', r'', affiliation)

            if r"@" in affiliation and r"0000-" in affiliation:
                affiliation = affiliation.replace(';', ' ')
                affiliation = affiliation.replace(r'. ', r'.')
                email = re.search(r"(\S+\@\S+)", affiliation).group(1)
                orcid = re.search(r"(0000-\S+)", affiliation).group(1)
                if re.match(email_regex, email):
                    subfields.append(('m', 'email:' + email))
                else:
                    print "Email problem:", email
                if  re.match(orcid_regex, orcid):
                    subfields.append(('j', 'ORCID:' + orcid))
                else:
                    print "ORCID problem:", orcid
                continue
            elif r"@" in affiliation:
                affiliation = affiliation.replace(r'. ', r'.')
                subfields.append(('m', affiliation))
                continue
            elif re.match(r"^0000-0", affiliation):
                try:
                    orcid = re.search(r'(0000-\d{4}-\d{4}-\d{3}[\dX])',
                                  affiliation).group(1)
                    subfields.append(('j', 'ORCID:' + orcid))
                except AttributeError:
                    print "ORCID problem:", affiliation
                continue
            elif re.match(r"^INSPIRE-", affiliation):
                subfields.append(('i', affiliation))
                continue

            affiliation_key = re.sub(r'\W+', ' ', affiliation).upper()
            try:
                for inst in AFFILIATIONS_DONE[affiliation_key]:
                    inst = re.sub(r'^\s+', '', inst)
                    subfields.append(('u', inst))
            except KeyError:
                if False:
                    print "AFF in: ", affiliation, "*"
                    time1 = time.time()
                inspire_affiliation = get_aff(unidecode(affiliation))
                if False:
                    time2 = time.time()
                    time_taken = time2 - time1
                    print "AFF out:", inspire_affiliation, \
                          "Time taken", time_taken
                for inst in inspire_affiliation:
                    inst = re.sub(r'^\s+', '', inst)
                    subfields.append(('u', inst))
                if not TEST:
                    AFFILIATIONS_DONE[affiliation_key] = inspire_affiliation
            subfields.append(('v', affiliation))
        record_add_field(record, tag[0:3], tag[3], tag[4], \
                         subfields=subfields)
        tag = '700__'
    return print_rec(record)
Ejemplo n.º 10
0
def similarity_between_caption1_and_caption2(caption1, caption2, list1, list2):
    """
	Function that takes two lists and two captions and returns if the caption 1 is matching caption 2
	
	@param caption1: caption from pdf source
	@param caption2: caption from latex source
	@param list1: list of pdf figures
	@param list2: list of latex figures
	
	@return: 0 if caption1 matches caption2, -1 else
	"""

    caption_list1 = [element.caption for element in list1]
    caption_list2 = [element.caption for element in list2]

    # Transformations (special characters and newlines)
    for index, caption in enumerate(caption_list1):
        # remove all spaces and new lines and tabs from the beginning, then if a line contains \n, replace with space
        caption = caption.lstrip().replace('\n', ' ')
        # when encounter at the beginning the regular expresion: Fig. number. => skip
        #ur UNICODE RAW
        caption = re.sub(ur'^Fig\. [\d]+\. ', '', caption)
        caption_list1[index] = caption
    print caption_list1
    for index, caption in enumerate(caption_list2):
        # transform all latex simbols in unicode
        caption = translate_latex2unicode(caption)
        # remove all spaces and new lines and tabs from the beginning, then if a line contains \n, replace with space
        caption = caption.lstrip().replace('\n', ' ')
        # when encounter at the beginning the regular expresion: number => skip
        #ur UNICODE RAW
        caption = re.sub(ur'^[\d]+ ', '', caption)
        caption_list2[index] = caption
    print caption_list2

    # Long common subsequence
    # Levenshtein distance
    dictionary = {}
    dictionary2 = {}
    dictionary3 = {}

    for i in range(len(caption_list1)):
        distances = []
        lcss = []
        for j in range(len(caption_list2)):
            distance = levenshtein(caption_list1[i], caption_list2[j])
            distances.append(distance)
            X = caption_list1[i]
            Y = caption_list2[j]
            m = len(X)
            n = len(Y)
            C = LCS(X, Y)
            lcs = backTrack(C, X, Y, m, n)
            lcss.append(lcs)

        print distances
        max_distance = 0
        index_max_elem_list = []
        for k in range(len(lcss)):
            if len(lcss[k]) > max_distance:
                max_distance = len(lcss[k])
                index_max = k
        index_max_elem_list.append(index_max)
        # if there are equal distances
        n = 0
        for k in range(len(lcss)):
            if len(lcss[k]) == max_distance:
                if (n != 0):
                    index_max_elem_list.append(k)
                n = 1

        min_distance = 100000
        index_min_elem_list = []
        index_min_direct_comparison = []

        for k in range(len(distances)):
            if distances[k] == 0:
                index_min_direct_comparison.append(k)
            if distances[k] < min_distance:
                min_distance = distances[k]
                index_min = k
        index_min_elem_list.append(index_min)
        # if there are equal distances
        n = 0
        for k in range(len(distances)):
            if distances[k] == min_distance:
                if (n != 0):
                    index_min_elem_list.append(k)
                n = 1

        dictionary[i] = index_min_elem_list
        print dictionary

        dictionary2[i] = index_max_elem_list
        print dictionary2

        dictionary3[i] = index_min_direct_comparison
        print dictionary3

    for caption in caption_list1:
        if caption == caption1:
            index_caption1 = caption_list1.index(caption)
    for caption in caption_list2:
        if caption == caption2:
            index_caption2 = caption_list2.index(caption)
    if index_caption2 in dictionary[index_caption1]:
        if index_caption2 in dictionary2[index_caption1]:
            #		if index_caption2 in dictionary3[index_caption1]
            return 0
    return -1
Ejemplo n.º 11
0
def similarity(list_latex, list_pdf):
    """
	The function that takes two lists of figures and detects the matches between them
	
	@param list_latex: the list of latex figures
	@param list_pdf: the list of pdf figures
	
	@return: the matching tuples
	"""
    caption_list_latex = [figure.caption for figure in list_latex]
    caption_list_pdf = [figure.caption for figure in list_pdf]

    for index, caption in enumerate(caption_list_latex):
        # transform all latex simbols in unicode
        caption = translate_latex2unicode(caption)
        # remove all spaces and new lines and tabs from the beginning, then if a line contains \n, replace with space
        caption = caption.lstrip().replace('\n', ' ')
        list_latex[index].caption = caption
        # when encounter at the beginning the regular expresion: number => skip
        #ur UNICODE RAW
        caption = re.sub(ur'^[\d]+ ', '', caption)
        caption_list_latex[index] = caption

    # Transformations (special characters and newlines)
    for index, caption in enumerate(caption_list_pdf):
        # remove all spaces and new lines and tabs from the beginning, then if a line contains \n, replace with space
        caption = caption.lstrip().replace('\n', ' ')
        caption = strip_control_characters(caption)
        list_pdf[index].caption = caption
        # when encounter at the beginning the regular expresion: Fig. number. => skip
        #ur UNICODE RAW
        caption = re.sub(ur'^Fig\. [\d]+\. ', '', caption)
        caption = re.sub(ur'^Figure [\d]+\: ', '', caption)
    # used for levenshtein distance
    dictionary = {}
    # used for longest common subsequence
    dictionary2 = {}
    # used for direct comparison
    dictionary3 = {}

    for i in range(len(caption_list_latex)):
        distances = []
        lcss = []
        for j in range(len(caption_list_pdf)):
            distance = levenshtein(caption_list_latex[i], caption_list_pdf[j])
            distances.append(distance)
            X = caption_list_latex[i]
            Y = caption_list_pdf[j]
            m = len(X)
            n = len(Y)
            C = LCS(X, Y)
            lcs = iterative(C, X, Y, m, n)
            lcss.append(lcs)

        max_distance = 0
        index_max = 0
        # the list we use in representing the longest common subsequence
        index_max_elem_list = []
        for k in range(len(lcss)):
            if len(lcss[k]) > max_distance:
                max_distance = len(lcss[k])
                index_max = k
        index_max_elem_list.append(index_max)
        # if there are equal distances
        n = 0
        for k in range(len(lcss)):
            if len(lcss[k]) == max_distance:
                if (n != 0):
                    index_max_elem_list.append(k)
                n = 1

        min_distance = 100000
        # the list we use in representing the levenshtein distance
        index_min_elem_list = []
        # the list for direct comparison
        index_min_direct_comparison = []

        for k in range(len(distances)):
            if distances[k] == 0:
                index_min_direct_comparison.append(k)
            if distances[k] < min_distance:
                min_distance = distances[k]
                index_min = k
        if len(distances) != 0:
            index_min_elem_list.append(index_min)
        # if there are equal distances
        n = 0
        for k in range(len(distances)):
            if distances[k] == min_distance:
                if (n != 0):
                    index_min_elem_list.append(k)
                n = 1

        dictionary[i] = index_min_elem_list
        print dictionary

        dictionary2[i] = index_max_elem_list
        print dictionary2

        dictionary3[i] = index_min_direct_comparison
        print dictionary3

    tuples = []
    for i in range(len(dictionary)):
        for j in range(len(dictionary.values()[i])):
            if (dictionary.values()[i] == dictionary2.values()[i]):
                # if dictionary.values()[i] == dictionary3.values()[i]
                a_tuple = i, dictionary.values()[i][j]
                tuples.append(a_tuple)
    return tuples