Ejemplo n.º 1
0
 def get_words(self, txt):
     """Return the words of ``txt`` that are not stopwords.

     The text is tokenized with ``self.extract_dictionary``. Every token is
     converted with ``any2utf8`` and kept only when the converted value is
     absent from ``self.stopwords``. An empty list is returned when the
     tokenizer yields ``None``.
     """
     tokens = self.extract_dictionary(txt)
     if tokens is None:
         return []

     kept = []
     for token in tokens:
         if any2utf8(token) in self.stopwords:
             continue
         kept.append(any2utf8(token))
     return kept
Ejemplo n.º 2
0
def extract_keywords(txt):
    """Return the tags TextBlob computes for a FR text, as a list.

    ``txt`` is converted with ``any2utf8`` and wrapped in a ``TextBlob``
    configured with ``PatternTagger`` and ``PatternAnalyzer``; the blob's
    ``tags`` are copied into a new list.
    """
    encoded = any2utf8(txt)
    tagger = PatternTagger()
    analyzer = PatternAnalyzer()
    blob = TextBlob(encoded, pos_tagger=tagger, analyzer=analyzer)
    return list(blob.tags)
Ejemplo n.º 3
0
def extract_dictionary(txt):
    """Return the words TextBlob extracts from a FR text, as a list.

    ``txt`` is converted with ``any2utf8`` and wrapped in a ``TextBlob``
    configured with ``PatternTagger`` and ``PatternAnalyzer``; the blob's
    ``words`` are copied into a new list.
    """
    encoded = any2utf8(txt)
    tagger = PatternTagger()
    analyzer = PatternAnalyzer()
    blob = TextBlob(encoded, pos_tagger=tagger, analyzer=analyzer)
    return list(blob.words)
Ejemplo n.º 4
0
    def put(self, id):
        """Validate a dataset update request against the stored CSV file.

        Returns ``(form.errors, 422)`` when the submitted form is invalid,
        ``404`` when no dataset has the given id, ``401`` when the dataset
        belongs to another user, and ``(message, 422)`` when the CSV corpus
        rejects the submitted column settings. When every check passes the
        method falls through and returns ``None``.
        """
        form = DatasetUpdateForm()
        if not form.validate_on_submit():
            return form.errors, 422

        # check if the record exists in the DB
        dataset = Dataset.query.filter_by(id=id).first()
        if dataset is None:
            # NOTE(review): a bare status code is returned here, unlike the
            # (body, status) pairs used for 422 -- confirm the framework
            # interprets it as intended.
            return 404

        # check rights
        if dataset.user.id != current_user.id:
            return 401

        # Truthiness rather than len(): field data that is None no longer
        # raises TypeError and is treated like an empty value.
        if form.additional_columns.data:
            additional_columns = any2utf8(form.additional_columns.data)
        else:
            additional_columns = None

        # validate values
        csv_corpus = CSVCorpus(dataset.filepath,
                               source_column=form.source_column.data,
                               text_column=form.text_column.data,
                               timestamp_column=form.time_column.data,
                               time_pattern=form.time_pattern.data,
                               additional_columns=additional_columns)

        try:
            csv_corpus.validate()
        except ValueError as e:
            # ``message`` only exists on Python 2 exceptions; fall back to the
            # string form so the handler also works on Python 3.
            message = getattr(e, "message", None) or str(e)
            return message, 422
Ejemplo n.º 5
0
def csv2elastic(dataset):
    """Index every row of a dataset's CSV file through ``elastic.index``.

    ``dataset`` is a mapping read with the keys "filepath", "time_column",
    "time_pattern", "text_column", "source_column", "additional_columns",
    "id" and "index_name". The matching ``Dataset`` record has its
    ``index_state`` set to "processing" before indexing and to "done"
    afterwards; both changes are committed.

    Returns the result of the last ``elastic.index`` call, or ``None`` when
    the corpus yields no rows.
    """
    logger.info("loading csv file")

    additional_columns = dataset["additional_columns"]
    if additional_columns:
        additional_columns = any2utf8(additional_columns)

    # open the corpus
    csv_corpus = CSVCorpus(dataset["filepath"],
                           timestamp_column=dataset["time_column"],
                           time_pattern=dataset["time_pattern"],
                           text_column=dataset["text_column"],
                           source_column=dataset["source_column"],
                           additional_columns=additional_columns)

    d = Dataset.query.filter_by(id=dataset["id"]).first()
    d.index_state = "processing"
    db.session.commit()

    # Initialised up front so an empty corpus does not leave ``res`` unbound
    # at the final return.
    res = None
    for row in csv_corpus:
        res = elastic.index(dataset["index_name"], "message", row)

    # change the state to done
    d.index_state = "done"
    db.session.commit()

    return res
Ejemplo n.º 6
0
    def __init__(self,
                 fname,
                 timestamp="created_at",
                 time_pattern="%Y-%m-%dT%H:%M:%S",
                 content="text",
                 origin="user_id",
                 adds=None):
        """
        Initialize the corpus from a file.

        Args:
            fname: path of the CSV file to read.
            timestamp: name of the column holding the timestamp.
            time_pattern: ``strptime`` pattern stored for parsing timestamps.
            content: name of the column holding the text.
            origin: name of the column holding the origin.
            adds: extra column names, given as a list or as a single
                comma-separated string; ``None`` means no extra columns.

        Raises:
            TypeError: if ``adds`` has an unsupported type, or if the sample
                of the file is not detected as UTF-8 with confidence >= 0.99.
            KeyError: if the CSV sniffer finds no header row.
        """
        logger.info("loading corpus from %s" % fname)
        self.fname = fname
        self.timestamp = timestamp
        self.time_pattern = time_pattern
        self.content = content
        self.origin = origin
        self.length = 0  # row counter, starts at zero

        # ``None`` is the default instead of a shared mutable ``[]`` so that
        # instances never alias the same default list.
        if adds is None:
            self.adds = []
        elif isinstance(adds, list):
            self.adds = adds
        elif isinstance(adds, str):
            self.adds = adds.split(",")
        elif isinstance(adds, unicode):
            self.adds = any2utf8(adds).split(",")
        else:
            raise TypeError("Wrong type for 'adds")

        # load the first few lines, to guess the CSV dialect; the sample
        # handle is closed as soon as the lines are read
        with open(self.fname, "r") as sample:
            head = ''.join(itertools.islice(sample, 5))
        sniffer = csv.Sniffer()
        self.has_headers = sniffer.has_header(head)
        self.dialect = sniffer.sniff(head)
        logger.info("sniffed CSV delimiter=%r, headers=%s" %
                    (self.dialect.delimiter, self.has_headers))

        # test encoding
        # NOTE(review): a sample detected as plain ASCII is rejected too,
        # since only 'utf-8' is accepted -- confirm this is intended.
        encoding = chardet.detect(head)
        self.encoding = encoding['encoding']

        if encoding['confidence'] < 0.99 or encoding['encoding'] != 'utf-8':
            raise TypeError(
                "File has an unknown encoding : %s. Please try UTF-8 for better compatibility"
                % encoding['encoding'])

        logger.info("encoding detected as %s" % (encoding["encoding"]))

        # headers are required; checked before opening the reader so that a
        # rejected file does not leave an open handle behind
        if not self.has_headers:
            raise KeyError("CSV file should have headers")

        # The reader keeps its file open because rows are consumed lazily.
        self.reader = csv.DictReader(open(self.fname, "r"),
                                     dialect=self.dialect)

        # store headers
        self.headers = self.reader.fieldnames
Ejemplo n.º 7
0
    def validateCSV(self):
        """
        Perform several checks on CSV files

        * file should have headers
        * required and additional columns should exist
        * timestamp of the next row read should match the time pattern

        Raises:
            KeyError: if the file has no header row.
            ValueError: if a column is missing, if no data row is left to
                read, or if the timestamp does not match
                ``self.time_pattern``.
        """
        # headers are required
        if not self.has_headers:
            raise KeyError("CSV file should have headers")

        # check if required columns exist
        required = (
            ("Time", self.timestamp),
            ("Text", self.content),
            ("Author", self.origin),
        )
        for label, column_name in required:
            if any2utf8(column_name) not in self.headers:
                raise ValueError("%s column '%s' not present in CSV" %
                                 (label, column_name))

        for column_name in self.adds:
            if any2utf8(column_name) not in self.headers:
                raise ValueError("Column '%s' not present in CSV" %
                                 column_name)

        # check time format (will raise ValueError)
        # NOTE: this reads (and therefore consumes) one row of self.reader.
        # next() is used instead of the Python-2-only .next() method.
        try:
            first_line = next(self.reader)
        except StopIteration:
            # An exhausted reader is reported like any other validation
            # failure instead of leaking StopIteration to the caller.
            raise ValueError("CSV file has no data rows")
        timestamp = first_line[any2utf8(self.timestamp)]
        datetime.strptime(timestamp, any2utf8(self.time_pattern))
Ejemplo n.º 8
0
    def validate(self):
        """
        Perform several checks on CSV files

        * file should have headers
        * required and additional columns should exist
        * timestamp of the next row read should match the time pattern

        Raises:
            KeyError: if the file has no header row.
            ValueError: if a column is missing, if no data row is left to
                read, or if the timestamp does not match
                ``self.time_pattern``.
        """
        # headers are required
        if not self.has_headers:
            raise KeyError("CSV file should have headers")

        # check if required columns exist
        required = (
            ("Time", self.timestamp_column),
            ("Text", self.text_column),
            ("Author", self.source_column),
        )
        for label, column_name in required:
            if any2utf8(column_name) not in self.headers:
                raise ValueError("%s column '%s' not present in CSV"
                                 % (label, column_name))

        for column_name in self.additional_columns:
            if any2utf8(column_name) not in self.headers:
                raise ValueError("Column '%s' not present in CSV" % column_name)

        # check time format (will raise ValueError)
        # NOTE: this reads (and therefore consumes) one row of self.reader.
        # next() is used instead of the Python-2-only .next() method.
        try:
            first_line = next(self.reader)
        except StopIteration:
            # An exhausted reader is reported like any other validation
            # failure instead of leaking StopIteration to the caller.
            raise ValueError("CSV file has no data rows")
        timestamp = first_line[any2utf8(self.timestamp_column)]
        datetime.strptime(timestamp, any2utf8(self.time_pattern))
Ejemplo n.º 9
0
def build_query(q, stopwords):
    """Build a keyword filter from a search term and a stopword list.

    A ``$in`` clause on "keywords" is added for ``q`` (converted with
    ``any2utf8``) and a ``$nin`` clause for ``stopwords``; each is skipped
    when its argument is ``None``. The clauses are combined under ``$and``.
    An empty dict is returned when both arguments are ``None``.
    """
    clauses = []

    if q is not None:
        clauses.append({"keywords": {"$in": [any2utf8(q)]}})

    if stopwords is not None:
        clauses.append({"keywords": {"$nin": stopwords}})

    # At least one clause exists exactly when one of the arguments was given.
    if not clauses:
        return {}
    return {"$and": clauses}
Ejemplo n.º 10
0
    def __init__(self, fname, timestamp_column="created_at", time_pattern="%Y-%m-%dT%H:%M:%S", text_column="text", source_column="user_id", additional_columns=None):
        """
        Initialize the corpus from a file.

        Args:
            fname: path of the CSV file to read.
            timestamp_column: name of the column holding the timestamp.
            time_pattern: ``strptime`` pattern stored for parsing timestamps.
            text_column: name of the column holding the text.
            source_column: name of the column holding the source.
            additional_columns: extra column names, given as a list or as a
                single comma-separated string; ``None`` means no extra
                columns.

        Raises:
            TypeError: if ``additional_columns`` has an unsupported type, or
                if the sample of the file is not detected as UTF-8 with
                confidence >= 0.99.
            KeyError: if the CSV sniffer finds no header row.
        """
        logger.info("loading corpus from %s" % fname)
        self.fname = fname
        self.timestamp_column = timestamp_column
        self.time_pattern = time_pattern
        self.text_column = text_column
        self.source_column = source_column
        self.length = 0  # row counter, starts at zero

        # ``None`` is the default instead of a shared mutable ``[]`` so that
        # instances never alias the same default list.
        if additional_columns is None:
            self.additional_columns = []
        elif isinstance(additional_columns, list):
            self.additional_columns = additional_columns
        elif isinstance(additional_columns, str):
            self.additional_columns = additional_columns.split(",")
        elif isinstance(additional_columns, unicode):
            self.additional_columns = any2utf8(additional_columns).split(",")
        else:
            raise TypeError("Wrong type for 'additional_columns")

        # load the first few lines, to guess the CSV dialect; the sample
        # handle is closed as soon as the lines are read
        with open(self.fname, "r") as sample:
            head = ''.join(itertools.islice(sample, 5))
        sniffer = csv.Sniffer()
        self.has_headers = sniffer.has_header(head)
        self.dialect = sniffer.sniff(head)
        logger.info("sniffed CSV delimiter=%r, headers=%s" % (self.dialect.delimiter, self.has_headers))

        # test encoding
        # NOTE(review): a sample detected as plain ASCII is rejected too,
        # since only 'utf-8' is accepted -- confirm this is intended.
        encoding = chardet.detect(head)
        self.encoding = encoding['encoding']

        if encoding['confidence'] < 0.99 or encoding['encoding'] != 'utf-8':
            raise TypeError("File has an unknown encoding : %s. Please try UTF-8 for better compatibility"% encoding['encoding'])

        logger.info("encoding detected as %s" % (encoding["encoding"]))

        # headers are required; checked before opening the reader so that a
        # rejected file does not leave an open handle behind
        if not self.has_headers:
            raise KeyError("CSV file should have headers")

        # The reader keeps its file open because rows are consumed lazily.
        self.reader = csv.DictReader(open(self.fname, "r"), dialect=self.dialect)

        # store headers
        self.headers = self.reader.fieldnames
Ejemplo n.º 11
0
    def __iter__(self):
        """
        Yield one dict per row read from ``self.reader``.

        Each dict stores the text (converted with ``any2utf8``) under
        "text_column", the timestamp parsed with ``self.time_pattern`` under
        "time_column" and the raw source value under "source_column", plus
        one converted entry per name in ``self.additional_columns``.
        ``self.length`` is incremented before each row is yielded.
        """
        for row in self.reader:
            record = {
                "text_column": any2utf8(row[any2utf8(self.text_column)]),
                "time_column": datetime.strptime(
                    row[any2utf8(self.timestamp_column)], self.time_pattern),
                "source_column": row[any2utf8(self.source_column)],
            }

            for extra in self.additional_columns:
                record[any2utf8(extra)] = any2utf8(row[any2utf8(extra)])

            # keep track of how many CSV rows have been produced
            self.length += 1

            yield record
Ejemplo n.º 12
0
def get_topogram(dataset):
    """Build an ``NLPPreProcess`` object for a dataset's CSV file.

    ``dataset`` is a mapping read with the keys "filepath", "time_column",
    "time_pattern", "text_column", "source_column", "additional_columns"
    and "index_name". The CSV corpus is paired with a ``ChineseNLP``
    instance, the index name is printed, and the resulting object is
    returned.
    """
    additional_columns = dataset["additional_columns"]
    if additional_columns:
        additional_columns = any2utf8(additional_columns)

    # open the corpus
    csv_corpus = CSVCorpus(dataset["filepath"],
                           timestamp_column=dataset["time_column"],
                           time_pattern=dataset["time_pattern"],
                           text_column=dataset["text_column"],
                           source_column=dataset["source_column"],
                           additional_columns=additional_columns)
    # init NLP
    nlp = ChineseNLP()

    # start processing  data
    topogram = NLPPreProcess(corpus=csv_corpus, nlp=nlp)
    # Call form of print: same output on Python 2 for a single argument,
    # and valid syntax on Python 3.
    print(dataset["index_name"])

    return topogram
Ejemplo n.º 13
0
    def __iter__(self):
        """
        Yield one dict per row read from ``self.reader``.

        Each dict stores the text (converted with ``any2utf8``) under
        "content", the timestamp parsed with ``self.time_pattern`` under
        "timestamp" and the raw origin value under "origin", plus one
        converted entry per name in ``self.adds``. ``self.length`` is
        incremented before each row is yielded.
        """
        for row in self.reader:
            record = {
                "content": any2utf8(row[any2utf8(self.content)]),
                "timestamp": datetime.strptime(
                    row[any2utf8(self.timestamp)], self.time_pattern),
                "origin": row[any2utf8(self.origin)],
            }

            for extra in self.adds:
                record[any2utf8(extra)] = any2utf8(row[any2utf8(extra)])

            # keep track of how many CSV rows have been produced
            self.length += 1

            yield record
Ejemplo n.º 14
0
def csv2elastic(dataset):
    """Index every row of a dataset's CSV file through ``elastic.index``.

    ``dataset`` is a mapping read with the keys "filepath", "time_column",
    "time_pattern", "text_column", "source_column", "additional_columns",
    "id" and "index_name". The matching ``Dataset`` record has its
    ``index_state`` set to "processing" before indexing and to "done"
    afterwards; both changes are committed.

    Returns the result of the last ``elastic.index`` call, or ``None`` when
    the corpus yields no rows.
    """
    logger.info("loading csv file")

    additional_columns = dataset["additional_columns"]
    if additional_columns:
        additional_columns = any2utf8(additional_columns)

    # open the corpus
    csv_corpus = CSVCorpus(dataset["filepath"],
                           timestamp_column=dataset["time_column"],
                           time_pattern=dataset["time_pattern"],
                           text_column=dataset["text_column"],
                           source_column=dataset["source_column"],
                           additional_columns=additional_columns)

    d = Dataset.query.filter_by(id=dataset["id"]).first()
    d.index_state = "processing"
    db.session.commit()

    # Initialised up front so an empty corpus does not leave ``res`` unbound
    # at the final return.
    res = None
    for row in csv_corpus:
        res = elastic.index(dataset["index_name"], "message", row)

    # change the state to done
    d.index_state = "done"
    db.session.commit()

    return res
Ejemplo n.º 15
0
def extract_keywords(txt):
    """Extract keywords from Chinese text.

    Calls ``jieba.analyse.extract_tags`` with ``txt`` and 20 as its second
    argument, and returns each resulting tag converted with ``any2utf8``.
    """
    keywords = []
    for tag in jieba.analyse.extract_tags(txt, 20):
        keywords.append(any2utf8(tag))
    return keywords
Ejemplo n.º 16
0
def extract_keywords(txt):
    """Return the tags TextBlob computes for a FR text, as a list.

    ``txt`` is converted with ``any2utf8`` and wrapped in a ``TextBlob``
    configured with ``PatternTagger`` and ``PatternAnalyzer``; the blob's
    ``tags`` are copied into a new list.
    """
    encoded = any2utf8(txt)
    tagger = PatternTagger()
    analyzer = PatternAnalyzer()
    blob = TextBlob(encoded, pos_tagger=tagger, analyzer=analyzer)
    return list(blob.tags)
Ejemplo n.º 17
0
 def test_any2utf8(self):
     # Round-trip the literal through decode/encode so that any2utf8 is
     # handed UTF-8 encoded bytes, then check the type of its result.
     decoded = "你好".decode('utf-8')
     encoded = decoded.encode('utf-8')
     converted = any2utf8(encoded)
     self.assertTrue(isinstance(converted, str))
Ejemplo n.º 18
0
 def test_any2utf8(self):
     # Round-trip the literal through decode/encode so that any2utf8 is
     # handed UTF-8 encoded bytes, then check the type of its result.
     decoded = "你好".decode('utf-8')
     encoded = decoded.encode('utf-8')
     converted = any2utf8(encoded)
     self.assertTrue(isinstance(converted, str))
Ejemplo n.º 19
0
def extract_dictionary(txt):
    """Return the words TextBlob extracts from a FR text, as a list.

    ``txt`` is converted with ``any2utf8`` and wrapped in a ``TextBlob``
    configured with ``PatternTagger`` and ``PatternAnalyzer``; the blob's
    ``words`` are copied into a new list.
    """
    encoded = any2utf8(txt)
    tagger = PatternTagger()
    analyzer = PatternAnalyzer()
    blob = TextBlob(encoded, pos_tagger=tagger, analyzer=analyzer)
    return list(blob.words)
Ejemplo n.º 20
0
def extract_keywords(txt):
    """Extract keywords from Chinese text.

    Calls ``jieba.analyse.extract_tags`` with ``txt`` and 20 as its second
    argument, and returns each resulting tag converted with ``any2utf8``.
    """
    keywords = []
    for tag in jieba.analyse.extract_tags(txt, 20):
        keywords.append(any2utf8(tag))
    return keywords