Beispiel #1
0
	def createLemmaText(self):
		"""Lemmatize ``self.rawText`` and store the result on the instance.

		The raw text is lower-cased, stripped of stop words and punctuation
		with ``CleanText``, then parsed with ``parseEN`` or ``parseFR``
		depending on ``self.language``.  For every token of element 0 of the
		parse result a ``(token[2].lower(), token[1][:2])`` tuple is appended
		to ``self.words`` (presumably lemma and a two-letter POS prefix --
		confirm against the parser's token layout).  ``self.cleanText`` is
		then set to the space-joined first elements of ``self.words``.

		Only ``'EN'`` and ``'FR'`` are handled.  Any other language is
		reported on stdout and leaves ``self.words`` and ``self.cleanText``
		untouched.  Errors raised while reading the tokens are also reported
		on stdout instead of being propagated.
		"""
		ct = CleanText()
		text = ct.removePunctuation(
			ct.removeStopWords(self.rawText.lower(), self.language))
		if self.language == 'EN':
			sentences = parseEN(text, tags=False, chunks=False, lemmata=True).split()
		elif self.language == 'FR':
			sentences = parseFR(text, tags=False, chunks=False, lemmata=True).split()
		else:
			# Without this branch the cleaned *string* reached the token loop,
			# which then indexed single characters and only surfaced as a
			# misleading "string index out of range" message.
			print('unsupported language %r: %s' % (self.language, self.rawText))
			return
		try:
			if sentences:
				# Only element 0 of the parse result is consumed.
				for token in sentences[0]:
					self.words.append((token[2].lower(), token[1][:2]))
				self.cleanText = ' '.join(word[0] for word in self.words)
		except Exception as e:
			# Best effort: report the problem together with the offending
			# text and carry on.  The single-argument print(...) form
			# produces the same output on Python 2 and Python 3.
			print('%s %s' % (e, self.rawText))
Beispiel #2
0
 def createLemmaText(self):
     """Lemmatize ``self.rawText`` and store the result on the instance.

     The raw text is lower-cased, stripped of stop words and punctuation
     with ``CleanText``, then parsed with ``parseEN`` or ``parseFR``
     depending on ``self.language``.  For every token of element 0 of the
     parse result a ``(token[2].lower(), token[1][:2])`` tuple is appended
     to ``self.words`` (presumably lemma and a two-letter POS prefix --
     confirm against the parser's token layout).  ``self.cleanText`` is
     then set to the space-joined first elements of ``self.words``.

     Only ``'EN'`` and ``'FR'`` are handled.  Any other language is
     reported on stdout and leaves ``self.words`` and ``self.cleanText``
     untouched.  Errors raised while reading the tokens are also reported
     on stdout instead of being propagated.
     """
     ct = CleanText()
     text = ct.removePunctuation(
         ct.removeStopWords(self.rawText.lower(), self.language))
     if self.language == 'EN':
         sentences = parseEN(text, tags=False, chunks=False,
                             lemmata=True).split()
     elif self.language == 'FR':
         sentences = parseFR(text, tags=False, chunks=False,
                             lemmata=True).split()
     else:
         # Without this branch the cleaned *string* reached the token loop,
         # which then indexed single characters and only surfaced as a
         # misleading "string index out of range" message.
         print('unsupported language %r: %s' % (self.language, self.rawText))
         return
     try:
         if sentences:
             # Only element 0 of the parse result is consumed.
             for token in sentences[0]:
                 self.words.append((token[2].lower(), token[1][:2]))
             self.cleanText = ' '.join(word[0] for word in self.words)
     except Exception as e:
         # Best effort: report the problem together with the offending
         # text and carry on.  The single-argument print(...) form
         # produces the same output on Python 2 and Python 3.
         print('%s %s' % (e, self.rawText))
Beispiel #3
0
__copyright__ = "Copyright 2015, University Politehnica of Bucharest"
__license__ = "GNU GPL"
__version__ = "0.1"
__email__ = "*****@*****.**"
__status__ = "Production"

import pymongo
from gensim.utils import lemmatize
from nlplib.clean_text import CleanText
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import cpu_count
import time

# Module-level MongoDB handles: a MongoClient() created with default
# arguments and the 'Tectoniq' database selected from it.
client = pymongo.MongoClient()
db = client['Tectoniq']
# Shared CleanText helper instance (class imported from nlplib.clean_text).
cleanText = CleanText()

# JavaScript source kept as a Python string.  The script iterates over every
# document of ``search_index2`` with a no-timeout cursor and inserts
# ``{word: item._id, docIDs: item.value.docIDs}`` into ``search_index``.
# NOTE(review): the ``_id`` / ``value`` layout looks like map-reduce output,
# and how this string gets executed is not visible in this part of the
# file -- confirm against the caller.
function = """function(){
				var items = db.search_index2.find().addOption(DBQuery.Option.noTimeout);
				while(items.hasNext()){
				var item = items.next();
					doc = {word: item._id, docIDs: item.value.docIDs};
					db.search_index.insert(doc);
				}
			}"""

mapFunction = """function() {
				var key = this.word;
				for (var idx=0; idx<this.docIDs.length; idx++){
					var tfidf = this.idf * this.docIDs[idx].tf;
					value = { 'docID': this.docIDs[idx].docID, 'TFIDF': tfidf };
Beispiel #4
0
 def __init__(self, dbname):
     """Set up the parser state for the database called *dbname*.

     Creates an empty ``documents`` list, a ``MongoClient()`` built with
     default arguments, the database handle selected from that client,
     and a ``CleanText`` helper instance.
     """
     self.documents = []
     self.dbname = dbname
     connection = MongoClient()
     self.client = connection
     self.db = connection[dbname]
     self.ct = CleanText()
Beispiel #5
0
class Parser(object):
    """Parse the XML data sets into dicts and store them in MongoDB.

    Each of :meth:`images`, :meth:`inventories` and :meth:`vdn` reads every
    ``*.xml`` file directly inside one hard-coded directory and appends one
    dict per record to ``self.documents``.  :meth:`insert` writes the
    accumulated dicts to the ``documents`` collection and then calls
    ``VI(dbname).createIndex()``.
    """

    # (XML tag, document key) pairs that images() copies only when the tag
    # is present under the ``description`` element.
    _IMAGE_OPTIONAL_FIELDS = (
        ("ReferenceBibliographique", "reference"),
        ("ProvenanceDocument", "source"),
        ("EtablissementDepositaire", "source_location"),
        ("AnneeEvenement", "date"),
    )

    # Elements whose ';'-separated text inventories() merges into "keywords".
    _INVENTORY_KEYWORD_TAGS = (
        "denomination",
        "grosOeuvres",
        "materiauxCouverture",
        "couvrement",
    )

    def __init__(self, dbname):
        """Connect with a default ``MongoClient()`` and select *dbname*."""
        self.dbname = dbname
        self.documents = []
        self.client = MongoClient()
        self.db = self.client[self.dbname]
        self.ct = CleanText()

    @staticmethod
    def _xml_paths(directory):
        """Return the paths of the regular ``*.xml`` files inside *directory*."""
        return [
            join(directory, name) for name in listdir(directory)
            if isfile(join(directory, name)) and name.endswith('.xml')
        ]

    def images(self):
        """Load the IRHIS image records into ``self.documents``.

        Files whose root element has no non-empty ``record_id`` attribute
        are skipped.  One dict is built per file; if a file contains several
        ``description`` elements, later ones overwrite the fields written by
        earlier ones.
        """
        directory = '../../DATA_SETS/IRHIS_BaseImages/'
        for path in self._xml_paths(directory):
            root = ET.parse(path).getroot()
            record_id = root.attrib.get("record_id")
            if not record_id:
                continue
            document = {"record_id": record_id.strip()}
            for child in root.findall("description"):
                # [0] keeps the first element of whatever cleanText returns.
                document["title"] = self.ct.cleanText(
                    child.find("TitreEnregistrement").text, "FR")[0]
                document["description"] = self.ct.cleanText(
                    child.find("Description").text, "FR")[0]
                document["epoch"] = [child.find("EpoqueEvenement").text]
                document["photo"] = child.find("CodePhoto").text
                for tag, field in self._IMAGE_OPTIONAL_FIELDS:
                    node = child.find(tag)
                    if node is not None:
                        document[field] = node.text
                words = self.getWords(document["description"])
                if words:
                    document['words'] = words
                # Sets de-duplicate repeated keyword / location elements.
                document["keywords"] = list(
                    {a.text for a in child.findall("MotsClefsAnalytiques")})
                document["location"] = list(
                    {a.text for a in child.findall("MotsClefsGeographiques")})
            self.documents.append(document)

    def insert(self):
        """Replace the ``documents`` collection and rebuild the index.

        Drops ``documents``, inserts ``self.documents`` when it is not
        empty, then calls ``VI(self.dbname).createIndex()``.
        """
        # Drop the collection that is written just below.  The previous code
        # dropped ``document`` (singular), so stale records in ``documents``
        # were never cleared before re-inserting.
        self.db.documents.drop()
        if self.documents:
            # NOTE: Collection.insert is the legacy PyMongo API (deprecated
            # in 3.0, removed in 4.0); kept to match the rest of the project.
            self.db.documents.insert(self.documents)
        vocab = VI(self.dbname)
        vocab.createIndex()

    def getWords(self, text):
        """Return one dict per lemma of *text*, or ``[]`` if nothing is left.

        *text* is stripped of punctuation and lemmatized as French with
        ``LemmatizeText``.  Each resulting dict has the keys ``word``,
        ``tf``, ``count`` and ``pos``, copied from the entries of
        ``wordList``.
        """
        lemmas = LemmatizeText(self.ct.removePunctuation(text), "FR")
        lemmas.createLemmaText()
        lemmaText = lemmas.cleanText
        if not lemmaText or lemmaText == " ":
            return []
        lemmas.createLemmas()
        return [
            {'word': w.word, 'tf': w.tf, 'count': w.count, 'pos': w.wtype}
            for w in lemmas.wordList
        ]

    def inventories(self):
        """Load the ServiceInventaire records into ``self.documents``.

        Files whose root element has no non-empty ``reference`` attribute
        are skipped.  Keyword elements that are missing or empty are
        ignored.  The other elements read here are required: a missing one
        raises ``AttributeError``.
        """
        directory = '../../DATA_SETS/ServiceInventaire/'
        for path in self._xml_paths(directory):
            root = ET.parse(path).getroot()
            reference = root.attrib.get("reference")
            if not reference:
                continue
            document = {}
            document["record_id"] = reference.strip()
            document["title"] = self.ct.cleanText(
                root.find("edifice").text, "FR")[0]
            # Cleaned once and reused; the original ran the identical
            # cleanText call twice (assumes cleanText is deterministic).
            # NOTE(review): the description holds the text twice, once as
            # is and once with ';' replaced by spaces -- confirm intent.
            history = self.ct.cleanText(root.find("historique").text, "FR")[0]
            document["description"] = (
                history + ' ' + ' '.join(history.split(';')))
            keywords = []
            for tag in self._INVENTORY_KEYWORD_TAGS:
                node = root.find(tag)
                # Skip absent elements too, as images() does for its
                # optional tags, instead of failing on ``None.text``.
                if node is not None and node.text is not None:
                    keywords += node.text.split(";")
            document["keywords"] = keywords
            document["epoch"] = root.find("epoqueConstruction").text.split(";")
            document["location"] = root.find("localisation").text.split(";")
            words = self.getWords(document["description"])
            if words:
                document['words'] = words
            self.documents.append(document)

    def vdn(self):
        """Load the La Voix du Nord articles into ``self.documents``.

        Every ``DOCUMENT`` element of every file becomes one dict whose
        ``record_id`` is ``'document_'`` followed by the element's ``id``
        attribute.
        """
        directory = '../../DATA_SETS/LaVoixDuNord/'
        for path in self._xml_paths(directory):
            root = ET.parse(path).getroot()
            for child in root.findall('DOCUMENT'):
                document = {}
                document['record_id'] = 'document_' + child.attrib.get("id")
                description = child.find('DESCRIPTION')
                document['source'] = description.find('SOURCE').text
                document['author'] = description.find('AUTEUR').text
                document['source_location'] = description.find('REFERENCE').text
                document['date'] = description.find('DATE').text
                document['title'] = self.ct.cleanText(
                    description.find('TITRE').text, "FR")[0]
                document['description'] = self.ct.cleanText(
                    child.find('TEXTE').text, "FR")[0]
                words = self.getWords(document["description"])
                if words:
                    document['words'] = words
                self.documents.append(document)
Beispiel #6
0
                        #authors
                        authors = []
                        for a in elem[5].split(','):
                            author = dict()
                            names = a.split(' ')
                            author['firstname'] = ' '.join(names[:-1])
                            author['lastname'] = names[-1]
                            authors.append(author)
                        document['authors'] = authors
                        document['source'] = elem[6]
                        document['words'] = words
            except Exception as e:
                print e
        return document
"""
# Module-level CleanText instance, created once when the module is imported.
ct = CleanText()
def insert_data(dbname, corpus, remove=False):
    client = pymongo.MongoClient()
    db = client[dbname]
    if remove:
        db.documents.remove({})
    documents = []
    no_threads = cpu_count()
    with ProcessPoolExecutor(max_workers=no_threads) as worker:
        for result in worker.map(process_element, corpus):
            if result:
                documents.append(result)
    if documents:
        print len(documents)
        try:
            db.documents.insert(documents, continue_on_error=True)
Beispiel #7
0
 def __init__(self, dbname):
     """Set up the parser state for the database called *dbname*.

     Creates an empty ``documents`` list, a ``MongoClient()`` built with
     default arguments, the database handle selected from that client,
     and a ``CleanText`` helper instance.
     """
     self.documents = []
     self.dbname = dbname
     connection = MongoClient()
     self.client = connection
     self.db = connection[dbname]
     self.ct = CleanText()
Beispiel #8
0
class Parser(object):
    """Parse the XML data sets into dicts and store them in MongoDB.

    Each of :meth:`images`, :meth:`inventories` and :meth:`vdn` reads every
    ``*.xml`` file directly inside one hard-coded directory and appends one
    dict per record to ``self.documents``.  :meth:`insert` writes the
    accumulated dicts to the ``documents`` collection and then calls
    ``VI(dbname).createIndex()``.
    """

    # (XML tag, document key) pairs that images() copies only when the tag
    # is present under the ``description`` element.
    _IMAGE_OPTIONAL_FIELDS = (
        ("ReferenceBibliographique", "reference"),
        ("ProvenanceDocument", "source"),
        ("EtablissementDepositaire", "source_location"),
        ("AnneeEvenement", "date"),
    )

    # Elements whose ';'-separated text inventories() merges into "keywords".
    _INVENTORY_KEYWORD_TAGS = (
        "denomination",
        "grosOeuvres",
        "materiauxCouverture",
        "couvrement",
    )

    def __init__(self, dbname):
        """Connect with a default ``MongoClient()`` and select *dbname*."""
        self.dbname = dbname
        self.documents = []
        self.client = MongoClient()
        self.db = self.client[self.dbname]
        self.ct = CleanText()

    @staticmethod
    def _xml_paths(directory):
        """Return the paths of the regular ``*.xml`` files inside *directory*."""
        return [
            join(directory, name) for name in listdir(directory)
            if isfile(join(directory, name)) and name.endswith('.xml')
        ]

    def images(self):
        """Load the IRHIS image records into ``self.documents``.

        Files whose root element has no non-empty ``record_id`` attribute
        are skipped.  One dict is built per file; if a file contains several
        ``description`` elements, later ones overwrite the fields written by
        earlier ones.
        """
        directory = '../../DATA_SETS/IRHIS_BaseImages/'
        for path in self._xml_paths(directory):
            root = ET.parse(path).getroot()
            record_id = root.attrib.get("record_id")
            if not record_id:
                continue
            document = {"record_id": record_id.strip()}
            for child in root.findall("description"):
                # [0] keeps the first element of whatever cleanText returns.
                document["title"] = self.ct.cleanText(
                    child.find("TitreEnregistrement").text, "FR")[0]
                document["description"] = self.ct.cleanText(
                    child.find("Description").text, "FR")[0]
                document["epoch"] = [child.find("EpoqueEvenement").text]
                document["photo"] = child.find("CodePhoto").text
                for tag, field in self._IMAGE_OPTIONAL_FIELDS:
                    node = child.find(tag)
                    if node is not None:
                        document[field] = node.text
                words = self.getWords(document["description"])
                if words:
                    document['words'] = words
                # Sets de-duplicate repeated keyword / location elements.
                document["keywords"] = list(
                    {a.text for a in child.findall("MotsClefsAnalytiques")})
                document["location"] = list(
                    {a.text for a in child.findall("MotsClefsGeographiques")})
            self.documents.append(document)

    def insert(self):
        """Replace the ``documents`` collection and rebuild the index.

        Drops ``documents``, inserts ``self.documents`` when it is not
        empty, then calls ``VI(self.dbname).createIndex()``.
        """
        # Drop the collection that is written just below.  The previous code
        # dropped ``document`` (singular), so stale records in ``documents``
        # were never cleared before re-inserting.
        self.db.documents.drop()
        if self.documents:
            # NOTE: Collection.insert is the legacy PyMongo API (deprecated
            # in 3.0, removed in 4.0); kept to match the rest of the project.
            self.db.documents.insert(self.documents)
        vocab = VI(self.dbname)
        vocab.createIndex()

    def getWords(self, text):
        """Return one dict per lemma of *text*, or ``[]`` if nothing is left.

        *text* is stripped of punctuation and lemmatized as French with
        ``LemmatizeText``.  Each resulting dict has the keys ``word``,
        ``tf``, ``count`` and ``pos``, copied from the entries of
        ``wordList``.
        """
        lemmas = LemmatizeText(self.ct.removePunctuation(text), "FR")
        lemmas.createLemmaText()
        lemmaText = lemmas.cleanText
        if not lemmaText or lemmaText == " ":
            return []
        lemmas.createLemmas()
        return [
            {'word': w.word, 'tf': w.tf, 'count': w.count, 'pos': w.wtype}
            for w in lemmas.wordList
        ]

    def inventories(self):
        """Load the ServiceInventaire records into ``self.documents``.

        Files whose root element has no non-empty ``reference`` attribute
        are skipped.  Keyword elements that are missing or empty are
        ignored.  The other elements read here are required: a missing one
        raises ``AttributeError``.
        """
        directory = '../../DATA_SETS/ServiceInventaire/'
        for path in self._xml_paths(directory):
            root = ET.parse(path).getroot()
            reference = root.attrib.get("reference")
            if not reference:
                continue
            document = {}
            document["record_id"] = reference.strip()
            document["title"] = self.ct.cleanText(
                root.find("edifice").text, "FR")[0]
            # Cleaned once and reused; the original ran the identical
            # cleanText call twice (assumes cleanText is deterministic).
            # NOTE(review): the description holds the text twice, once as
            # is and once with ';' replaced by spaces -- confirm intent.
            history = self.ct.cleanText(root.find("historique").text, "FR")[0]
            document["description"] = (
                history + ' ' + ' '.join(history.split(';')))
            keywords = []
            for tag in self._INVENTORY_KEYWORD_TAGS:
                node = root.find(tag)
                # Skip absent elements too, as images() does for its
                # optional tags, instead of failing on ``None.text``.
                if node is not None and node.text is not None:
                    keywords += node.text.split(";")
            document["keywords"] = keywords
            document["epoch"] = root.find("epoqueConstruction").text.split(";")
            document["location"] = root.find("localisation").text.split(";")
            words = self.getWords(document["description"])
            if words:
                document['words'] = words
            self.documents.append(document)

    def vdn(self):
        """Load the La Voix du Nord articles into ``self.documents``.

        Every ``DOCUMENT`` element of every file becomes one dict whose
        ``record_id`` is ``'document_'`` followed by the element's ``id``
        attribute.
        """
        directory = '../../DATA_SETS/LaVoixDuNord/'
        for path in self._xml_paths(directory):
            root = ET.parse(path).getroot()
            for child in root.findall('DOCUMENT'):
                document = {}
                document['record_id'] = 'document_' + child.attrib.get("id")
                description = child.find('DESCRIPTION')
                document['source'] = description.find('SOURCE').text
                document['author'] = description.find('AUTEUR').text
                document['source_location'] = description.find('REFERENCE').text
                document['date'] = description.find('DATE').text
                document['title'] = self.ct.cleanText(
                    description.find('TITRE').text, "FR")[0]
                document['description'] = self.ct.cleanText(
                    child.find('TEXTE').text, "FR")[0]
                words = self.getWords(document["description"])
                if words:
                    document['words'] = words
                self.documents.append(document)