def __init__(self, entityparams=ee.ALL):
    """
    Create the helper objects used during metadata extraction and store
    them as attributes of the given instance.

    @param entityparams: value handed over to the citation entity extractor
                         (defaults to ee.ALL)
    """
    self.params = entityparams

    # Entity-level extractors.
    self.entity_extractor = ee.EntityExtractor()
    self.meta_extractor = MetaExtractor()
    self.cita_parser = CitationEntityExtractor(entityparams)

    # Document wrapping helpers.
    self.document_wrapper = DocumentWrapper()
    self.textual_document = TextualDocument()

    # Remaining helpers: e-mails, document info, cleaning, language.
    self.email_extractor = ee.EntityExtractor.EmailExtractor()
    self.document_info = DocumentInfo()
    self.cleaner = TextCleaner()
    self.lang_identifier = LanguageIdentifier()
class ArticleMetaExtractor(object):
    """
    This class extracts metadata from articles.
    """

    def __init__(self, entityparams=ee.ALL):
        """
        Create the helper objects used by the extraction methods.

        @param entityparams: value handed over to the citation entity
                             extractor (defaults to ee.ALL)
        """
        self.params = entityparams

        # Entity-level extractors.
        self.entity_extractor = ee.EntityExtractor()
        self.meta_extractor = MetaExtractor()
        self.cita_parser = CitationEntityExtractor(entityparams)

        # Document wrapping helpers.
        self.document_wrapper = DocumentWrapper()
        self.textual_document = TextualDocument()

        # Remaining helpers: e-mails, document info, cleaning, language.
        self.email_extractor = ee.EntityExtractor.EmailExtractor()
        self.document_info = DocumentInfo()
        self.cleaner = TextCleaner()
        self.lang_identifier = LanguageIdentifier()


    def _assign_emails(self, emails, names):
        """
        Assign e-mail addresses to the person names they most likely belong to.

        For every name a list of candidate strings is built by concatenating
        each prefix of the first name(s) with each prefix of the surname.
        An e-mail is scored against a name by counting how many of these
        candidates occur (ignoring case) in the letters of the address part
        in front of the '@'. Every name keeps the e-mail with the highest
        score; whenever an e-mail takes over a name, all e-mails are
        examined again so that a displaced address can move to another name.

        @param emails: list of emails
        @type emails: [RRSEmail]
        @param names: list of person names
        @type names: [RRSPerson]
        @return: the given list of persons; every person with a matched
                 e-mail gets a 'contact' relationship set
        @rtype: [RRSPerson]
        """
        full_names = [person.get('full_name') for person in names]
        addresses = [email.get_localpart() + '@' + email.get_domain()
                     for email in emails]

        # Start assigning only if at least one author was extracted.
        if not full_names:
            return names

        re_trailing_space = re.compile(r' $')
        re_not_name_char = re.compile(r'[^A-Za-z ]')
        re_firstname = re.compile(r'^([A-Z][a-z]*.*) ([A-Z][A-Za-z]*)?')
        re_surname = re.compile(r'([A-Z][a-z]*.*) ([A-Z][A-Za-z]*)$')

        # Every author gets an own list of (lower-cased) candidate strings.
        name_forms = []
        for full_name in full_names:
            if full_name == "":
                # An empty name has no candidates; keep processing the
                # remaining names instead of abandoning them.
                name_forms.append([])
                continue

            # Normalise the name: dots become spaces, one trailing space is
            # dropped, everything except ASCII letters and spaces is removed.
            cleaned = full_name.replace('.', ' ')
            cleaned = re_trailing_space.sub('', cleaned)
            cleaned = re_not_name_char.sub('', cleaned)

            # Split the name into first name(s) and surname; a name that
            # does not split is used as both.
            first_match = re_firstname.search(cleaned)
            first = first_match.group(1) if first_match else cleaned
            last_match = re_surname.search(cleaned)
            last = last_match.group(2) if last_match else cleaned
            first = first.replace(' ', '')

            # NOTE(review): the list contains the empty string (both
            # prefixes empty), which occurs in every local part, so every
            # non-empty name scores at least 1 against every e-mail.
            name_forms.append([
                (first[:len(first) - j] + last[:len(last) - k]).lower()
                for k in range(len(last) + 1)
                for j in range(len(first) + 1)
            ])

        # Normalise the e-mails: keep only the letters in front of the '@'.
        re_before_at = re.compile(r'(.*)(@)')
        re_not_letter = re.compile(r'[^A-Za-z]')
        local_parts = [
            re_not_letter.sub('', re_before_at.search(address).group(1)).lower()
            for address in addresses
        ]

        # Assign e-mails to names by the highest number of matching strings.
        # best_score[j] is the score of the e-mail currently held by name j,
        # assigned[j] is the index of that e-mail (None when there is none).
        best_score = [0] * len(full_names)
        assigned = [None] * len(full_names)

        i = 0
        while i < len(local_parts):
            local_part = local_parts[i]
            top_score, top_j = 0, 0
            for j, forms in enumerate(name_forms):
                # Both sides contain only lower-cased ASCII letters/spaces,
                # so a substring test is a case-insensitive match.
                score = sum(1 for form in forms if form in local_part)
                # Only names whose current e-mail does not score higher
                # are candidates for this e-mail.
                if score > top_score and best_score[j] <= score:
                    top_score, top_j = score, j
            if best_score[top_j] < top_score:
                best_score[top_j] = top_score
                assigned[top_j] = i
                # Start over so a displaced e-mail can find another name.
                i = 0
            else:
                i += 1

        # Attach the assigned e-mails to the persons as contacts.
        for person, email_index in zip(names, assigned):
            if email_index is not None:
                _rel = RRSRelationshipContactPerson()
                _rel.set_entity(RRSContact(email=emails[email_index]))
                person.set('contact', _rel)

        return names


    def extract_data(self, document, module=None, files=None, type=None):
        """
        Output of this method is RRSPublication object with extracted data.

        The document is cleaned and wrapped, and then language, files, type,
        keywords, abstract, title, authors (with assigned e-mails),
        publisher, chapters and citations are stored into the publication.
        Each extraction step works on the text left over by the previous one.

        @param document: text form of a document
        @type document: str
        @param module: module name
        @type module: str
        @param files: list of files with the document (None means no files)
        @type files: [RRSFile]
        @param type: type of the document; when None and a txt file is
                     among the files, the type is detected from the files
        @type type: str
        @return: document's metadata
        @rtype: RRSPublication
        """
        if files is None:
            files = []

        document = self.cleaner.clean_text(document)

        publication = RRSPublication()

        # Create publication text
        rrs_text = RRSText(content=document, length=len(document))

        # Wrap document
        textual_document = self.document_wrapper.wrap(document)
        meta_text = textual_document.get_meta()

        # Store module information into publication
        publication.set('module', module)

        # Get and store publication language; the identifier's score is
        # doubled and capped at 100 to form the credibility.
        lang_data = self.lang_identifier.identify(meta_text)
        lang = RRSLanguage(name=lang_data[0])
        lang.set('credibility', min(int(lang_data[1] * 2), 100))
        publication.set('language', lang)

        # Get files and store them into publication
        txt_file_path = None
        pdf_file_path = None
        for f in files:
            url = f.get('url')[0].get_entities()[0]
            # A missing filename is treated as an empty string so that the
            # declared file type can still decide.
            filename = f.get("filename") or ""
            declared_type = f.get('type') if f.isset('type') else None
            if re.search(r'\.txt$', filename) or declared_type == "txt":
                txt_file_path = url.get('link')
                rrs_text.set('file', f)
            elif re.search(r'\.pdf$', filename) or declared_type == "pdf":
                pdf_file_path = url.get('link')
            _rel = RRSRelationshipFilePublication()
            _rel.set_entity(f)
            publication.set('file', _rel)

        publication.set('text', rrs_text)

        # Get publication type
        if type is None and txt_file_path is not None:
            type = self.document_info.get_document_type(txt_file_path,
                                                        pdf_file_path)
            publication.set('type', RRSPublication_type(type=type))
        elif type is not None:
            publication.set('type', RRSPublication_type(type=type))

        # Get keywords and store them into publication
        keywords = self.meta_extractor.find_keywords(meta_text)
        for keyword in keywords[0]:
            _rel = RRSRelationshipPublicationKeyword()
            _rel.set_entity(keyword)
            publication.set('keyword', _rel)
        # NOTE(review): unlike the steps below, the remaining text is only
        # taken over when at least one keyword was found - confirm whether
        # find_keywords always returns usable rest text.
        if keywords[0]:
            meta_text = keywords[1]

        # Get abstract and store it into publication
        abstract = self.meta_extractor.find_abstract(meta_text)
        publication.set('abstract', abstract[0])
        meta_text = abstract[1]

        # Get title from document and store it into publication
        title = self.meta_extractor.find_title(meta_text)
        publication.set('title', title[0])
        meta_text = title[1]

        # Get emails
        emails = self.email_extractor.get_emails(meta_text)
        meta_text = self.email_extractor.get_rest()

        # Get names, assign emails and store them into publication;
        # author_rank is the 1-based position in the returned list.
        names = self.entity_extractor.find_authors(meta_text)
        assigned_names = self._assign_emails(emails, names[0])
        for rank, name in enumerate(assigned_names, 1):
            _rel = RRSRelationshipPersonPublication(author_rank=rank,
                                                    editor=False)
            _rel.set_entity(name)
            publication.set('person', _rel)
        meta_text = names[1]

        # Get publisher from document and store it into publication
        publisher = self.entity_extractor.find_publisher(meta_text)
        publication.set('publisher', RRSOrganization(title=publisher[0]))
        meta_text = publisher[1]

        # Get chapters from document and store them into publication
        for chpt in textual_document.get_chapters():
            _rel = RRSRelationshipPublication_sectionPublication()
            _rel.set_entity(chpt)
            publication.set("publication_section", _rel)

        # Get citations from document and store them into publication
        for cit in textual_document.get_citations():
            if cit is None:
                continue
            _cit = self.cita_parser.extract(cit)
            if _cit.isset('reference'):
                # Link the parsed reference back to the citing publication.
                _cit['reference']['publication'] = publication
            _rel = RRSRelationshipPublicationCitation()
            _rel.set_entity(_cit)
            publication.set("citation", _rel)

        return publication