Example #1
def build_schema(infile, outfile, delimiter=None, quotechar='\"', encoding=None, dataset_name=None, base="https://iisg.amsterdam/"):
    """
    Build a CSVW schema based on the ``infile`` CSV file, and write the resulting JSON CSVW schema to ``outfile``.

    Takes various optional parameters to configure the CSV reader, but can usually guess sensible values when they are omitted.
    """

    url = os.path.basename(infile)
    # Get the current date and time (UTC)
    today = datetime.datetime.utcnow().strftime("%Y-%m-%d")

    if dataset_name is None:
        dataset_name = url

    if encoding is None:
        detector = UniversalDetector()
        with open(infile, 'rb') as f:
            for line in f.readlines():
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        encoding = detector.result['encoding']
        logger.info("Detected encoding: {} ({} confidence)".format(detector.result['encoding'],
                                                                   detector.result['confidence']))

    if delimiter is None:
        try: #Python 3
            with open(infile, 'r', errors='ignore') as csvfile:
                # dialect = csv.Sniffer().sniff(csvfile.read(1024), delimiters=";,$\t")
                dialect = csv.Sniffer().sniff(csvfile.readline()) #read only the header instead of the entire file to determine delimiter
                csvfile.seek(0)
        except TypeError: #Python 2
            with open(infile, 'r') as csvfile:
                # dialect = csv.Sniffer().sniff(csvfile.read(1024), delimiters=";,$\t")
                dialect = csv.Sniffer().sniff(csvfile.readline()) #read only the header instead of the entire file to determine delimiter
                csvfile.seek(0)
        logger.info("Detected dialect: {} (delimiter: '{}')".format(dialect, dialect.delimiter))
        delimiter = dialect.delimiter


    logger.info("Delimiter is: {}".format(delimiter))

    if base.endswith('/'):
        base = base[:-1]

    metadata = {
        u"@id": iribaker.to_iri(u"{}/{}".format(base, url)),
        u"@context": [u"https://raw.githubusercontent.com/CLARIAH/COW/master/csvw.json",
                     {u"@language": u"en",
                      u"@base": u"{}/".format(base)},
                     get_namespaces(base)],
        u"url": url,
        u"dialect": {u"delimiter": delimiter,
                    u"encoding": encoding,
                    u"quoteChar": quotechar
                    },
        u"dc:title": dataset_name,
        u"dcat:keyword": [],
        u"dc:publisher": {
            u"schema:name": u"CLARIAH Structured Data Hub - Datalegend",
            u"schema:url": {u"@id": u"http://datalegend.net"}
        },
        u"dc:license": {u"@id": u"http://opendefinition.org/licenses/cc-by/"},
        u"dc:modified": {u"@value": today, u"@type": u"xsd:date"},
        u"tableSchema": {
            u"columns": [],
            u"primaryKey": None,
            u"aboutUrl": u"{_row}"
        }
    }

    with io.open(infile, 'rb') as infile_file:
        r = csv.reader(infile_file, delimiter=delimiter, quotechar=quotechar)
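        # Note: csv.reader here is given a binary-mode file object, which only works on
        # Python 2; Python 3's csv module expects a text-mode file (it would raise
        # "iterator should return strings, not bytes"). The r.next()/next(r) fallback
        # below likewise distinguishes Python 2 from Python 3.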

        try:
            # Python 2
            header = r.next()
        except AttributeError:
            # Python 3
            header = next(r)

        logger.info(u"Found headers: {}".format(header))

        if u'' in header:
            logger.warning("WARNING: You have one or more empty column headers in your CSV file. Conversion might produce incorrect results because of conflated URIs or worse")
        if len(set(header)) < len(header):
            logger.warning("WARNING: You have two or more column headers that are syntactically the same. Conversion might produce incorrect results because of conflated URIs or worse")

        # First column is primary key
        metadata[u'tableSchema'][u'primaryKey'] = header[0]

        for head in header:
            col = {
                u"@id": iribaker.to_iri(u"{}/{}/column/{}".format(base, url, head)),
                u"name": head,
                u"titles": [head],
                u"dc:description": head,
                u"datatype": u"string"
            }

            metadata[u'tableSchema'][u'columns'].append(col)

    with open(outfile, 'w') as outfile_file:
        outfile_file.write(json.dumps(metadata, indent=True))

    logger.info("Done")
    return
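
# A hypothetical invocation of build_schema above (not part of the original example;
# the file names are purely illustrative):
if __name__ == '__main__':
    build_schema('example.csv', 'example.csv-metadata.json', dataset_name='Example dataset')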
Example #2
    def create_search_index(self):
        search_index = set()

        detector = UniversalDetector()
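        # Note: in this example the detector is instantiated here and closed near the end
        # of the method, but it is never fed any data.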

        ### Project code
        try:
            search_index.add(self.proj_kod.decode("utf-8"))
        except:
            search_index.add(self.proj_kod)

        ## Project name
        for w in self.proj_name.split():
            try:
                search_index.add(w.decode("utf-8"))
            except:
                search_index.add(w)

        ### Project initiator
        if self.proj_init:
            try:
                search_index.add(self.proj_init.name.decode("utf-8"))
            except:
                search_index.add(self.proj_init.name)

        ### Project executor
        if self.executor:
            try:
                search_index.add(self.executor.name.decode("utf-8"))
            except:
                search_index.add(self.executor.name)

        ### Stages
        if self.stage:
            try:
                search_index.add(self.stage.getfullname().decode("utf-8"))
            except:
                search_index.add(self.stage.getfullname())

        ### Address
        if self.data.has_key('address'):
            for addr in self.data["address"]:
                try:
                    search_index.add(addr["city"].decode("utf-8"))
                except:
                    search_index.add(addr["city"])
                try:
                    search_index.add(addr["street"].decode("utf-8"))
                except:
                    search_index.add(addr["street"])

        ### Counterparty
        for w in self.contragent.split():
            try:
                search_index.add(w.decode("utf-8"))
            except:
                search_index.add(w)

        ### Assignees
        for ex in self.reestr_proj_exec_date_set.all():
            if ex.worker:
                for w in ex.worker.get_full_name().split():
                    try:
                        search_index.add(w.decode("utf-8"))
                    except:
                        search_index.add(w)

        ### Links to other systems
        if self.data.has_key('other_system'):
            for code in self.data['other_system']:
                try:
                    search_index.add(code['other_name'].decode("utf-8"))
                except:
                    search_index.add(code['other_name'])
                try:
                    search_index.add(code['other_code'].decode("utf-8"))
                except:
                    search_index.add(code['other_code'])

        detector.close()

        self.search_index = u"".join(list(search_index))
        self.save()

        return "ok"
Example #3
    def load_file(self, f_path):
        """
        Load data from a CSV file to the workspace.
        Column 0 is used for the index column.
        chardet attempts to determine encoding if file is not utf-8.
            # Attributes
                f_path(String): The filename selected via open_file
        """
        # FIXME: Reset status bar when new data is loaded.
        try:
            self.full_data = pd.read_csv(f_path, encoding='utf-8', index_col=0)
        except UnicodeDecodeError as ude:
            self.logger.warning("UnicodeDecode error opening file",
                                exc_info=True)
            self.comms.update_statusbar.emit(
                "Attempting to determine file encoding...")
            detector = UniversalDetector()
            try:
                for line in open(f_path, 'rb'):
                    detector.feed(line)
                    if detector.done:
                        break
                detector.close()
                print("chardet determined encoding type to be {}".format(
                    detector.result['encoding']))
                self.full_data = pd.read_csv(
                    f_path, encoding=detector.result['encoding'], index_col=0)
            except Exception as e:
                self.logger.error("Error detecing encoding", exc_info=True)
                exceptionWarning("Exception has occured.", exception=e)
        except IOError as ioe:
            self.logger.error("IOError detecting encoding", exc_info=True)
            exceptionWarning("IO Exception occured while opening file.",
                             exception=ioe)
        except Exception as e:
            self.logger.error("Error detecting encoding", exc_info=True)
            exceptionWarning("Error occured opening file.", exception=e)

        try:
            columns = self.full_data.columns
            self.available_columns = []

            for column in columns:
                if column.endswith("text"):
                    self.available_columns.append(column)
            if self.available_columns:
                self.available_column_model.loadData(self.available_columns,
                                                     include_labels=False)

                self.available_column_model.setAllowableData(
                    self.allowable_columns)
                # drop_cols = [col for col in self.full_data.columns if col not in self.available_columns ]
                # self.full_data.drop(drop_cols, axis=1, inplace=True)
                # print("full_data columns: ", self.full_data.columns)
                self.full_text_count.setText(str(self.full_data.shape[0]))
                # self.display_selected_row(None)
                self.select_all_btn.setEnabled(True)
                self.deselect_all_btn.setEnabled(True)

                self.comms.update_statusbar.emit("CSV loaded.")
            else:
                exceptionWarning("No allowable data discovered in file.")
        except pd.errors.EmptyDataError as ede:
            exceptionWarning('Empty Data Error.\n', exception=ede)
        except Exception as e:
            self.logger.error("Error loading dataframe", exc_info=True)
            exceptionWarning("Exception occured.  PredictWidget.load_file.",
                             exception=e)
Example #4
 def load_file(self, f_path):
     """
     Load data from a CSV file to the workspace.
     Column 0 is used for the index column.
     chardet attempts to determine encoding if file is not utf-8.
         # Attributes
             f_path(String): The filename selected via open_file
     """
     # FIXME: Reset status bar when new data is loaded.
     try:
         self.update_progressbar.emit(0, True)
         self.available_column_model.loadData([])
         self.select_all_btn.setEnabled(False)
         self.deselect_all_btn.setEnabled(False)
         self.full_data = pd.read_csv(f_path, encoding='utf-8', index_col=0, sep=None)
     except UnicodeDecodeError as ude:
         self.logger.warning(
             "UnicodeDecode error opening file", exc_info=True)
         print("UnicodeDecodeError caught.  File is not UTF-8 encoded. \
                Attempting to determine file encoding...")
         self.update_statusbar.emit(
             "File is not UTF-8 encoded. Attempting to determine file encoding...")
         detector = UniversalDetector()
         try:
             for line in open(f_path, 'rb'):
                 detector.feed(line)
                 if detector.done:
                     break
             detector.close()
             self.update_statusbar.emit("Chardet determined encoding type to be {}".format(
                 detector.result['encoding']))
             self.logger.info("Chardet determined encoding type to be {}".format(
                 detector.result['encoding']))
             self.full_data = pd.read_csv(
                 f_path, encoding=detector.result['encoding'], index_col=0)
         except Exception as e:
             self.logger.error("Error detecting encoding", exc_info=True)
             exceptionWarning("Exception has occured.", exception=e)
     except IOError as ioe:
         self.logger.error("IOError detecting encoding", exc_info=True)
         exceptionWarning(
             "IO Exception occured while opening file.", exception=ioe)
     except Exception as e:
         self.logger.error("Error detecting encoding", exc_info=True)
         exceptionWarning("Error occured opening file.", exception=e)
     #TODO: clean up dataset by removing NA for values or index
     try:
         columns = self.full_data.columns
         self.available_columns = []
         for column in columns:
             if column.endswith(DATA_COLUMN_SUFFIX):
                 label_col = column.split(TAG_DELIMITER)[0] + TRUTH_SUFFIX 
                 if label_col in columns:
                     self.available_columns.append(column)
                     self.available_columns.append(label_col)
         # If no data found, the model will be reset.
         if(self.available_columns):
             self.available_column_model.loadData(self.available_columns)
             self.full_text_count.setText(str(self.full_data.shape[0]))
             self.display_selected_rows(None)
             self.update_statusbar.emit("CSV loaded.")
             self.select_all_btn.setEnabled(True)
             self.deselect_all_btn.setEnabled(True)
         else:
             exceptionWarning(f"No usable data found in {f_path}")
             self.logger.info(f"No usable data found in {f_path}")
             self.update_statusbar.emit("No usable data found in file")
         self.available_column_model.setCheckboxes(False)
         self.load_selected_data()
     except pd.errors.EmptyDataError as ede:
         exceptionWarning(
             exceptionTitle='Empty Data Error.\n', exception=ede)
     except Exception as e:
         self.logger.error("Error loading dataframe", exc_info=True)
         exceptionWarning(
             "Exception occured.  DataLoader.load_file.", exception=e)
         tb = traceback.format_exc()
         print(tb)
     finally:
         self.update_progressbar.emit(0, False)
Example #5
 def __init__(self, fileName):
     self.fileName = fileName
     self.detector = UniversalDetector()
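
# Example #5 only shows a UniversalDetector being stored on the instance. The sketch
# below is a minimal, self-contained illustration (not the original class) of the
# feed/close/result cycle such a wrapper would typically drive; the function name and
# file handling are assumptions, only chardet's documented API is used.
from chardet.universaldetector import UniversalDetector

def detect_encoding(file_name):
    detector = UniversalDetector()
    with open(file_name, 'rb') as f:
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result['encoding']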
Example #6
def detect_file_encoding(file_path, buffer_size=1024, max_lines=20):
    """
    Determine a file's encoding by reading up to `max_lines` chunks of `buffer_size` bytes each.

    :param file_path: path to the file
    :type file_path: str
    :param buffer_size: buffer length for each line being read
    :type buffer_size: int
    :param max_lines: maximum number of chunks of length buffer_size to read from the file
    :type max_lines: int
    :return: encoding type
    :rtype: str
    """
    detector = UniversalDetector()
    line_count = 0
    with FileOrBufferHandler(file_path, "rb") as input_file:
        chunk = input_file.read(buffer_size)
        while chunk and line_count < max_lines:
            detector.feed(chunk)
            chunk = input_file.read(buffer_size)
            line_count += 1
    detector.close()
    encoding = detector.result["encoding"]

    # Typical file representation is utf-8 instead of ascii, treat as such.
    if not encoding or encoding.lower() in ["ascii", "windows-1254"]:
        encoding = "utf-8"

    # Check if encoding can be used to decode without throwing an error
    def _decode_is_valid(encoding):
        try:
            with FileOrBufferHandler(file_path,
                                     encoding=encoding) as input_file:
                input_file.read(1024 * 1024)
                return True
        except Exception:
            return False

    if not _decode_is_valid(encoding):
        try:
            from charset_normalizer import CharsetNormalizerMatches as CnM

            # Try with small sample
            with FileOrBufferHandler(file_path, "rb") as input_file:
                raw_data = input_file.read(10000)
                result = CnM.from_bytes(
                    raw_data,
                    steps=5,
                    chunk_size=512,
                    threshold=0.2,
                    cp_isolation=None,
                    cp_exclusion=None,
                    preemptive_behaviour=True,
                    explain=False,
                )
                result = result.best()
            if result:
                if result.first():
                    encoding = result.first().encoding

            # Try again with full sample
            if not _decode_is_valid(encoding):
                with FileOrBufferHandler(file_path, "rb") as input_file:
                    raw_data = input_file.read(max_lines * buffer_size)
                    result = CnM.from_bytes(
                        raw_data,
                        steps=max_lines,
                        chunk_size=buffer_size,
                        threshold=0.2,
                        cp_isolation=None,
                        cp_exclusion=None,
                        preemptive_behaviour=True,
                        explain=False,
                    )
                    result = result.best()
                if result:
                    if result.first():
                        encoding = result.first().encoding

        except Exception:
            logger.info("Install charset_normalizer for improved file "
                        "encoding detection")

    # If no encoding is still found, default to utf-8
    if not encoding:
        encoding = "utf-8"
    return encoding.lower()
Example #7
 def set_source(self, name):
     # source _dependent_ initialization goes here
     if name is None or not os.path.isfile(name):
         return False
     IP.set_source(self, name)
     self.__source_name = name
     # auto-detect file-encoding (optional)
     try:
         from chardet.universaldetector import UniversalDetector
         detector = UniversalDetector()
         detector.reset()
         lines = 0
         for line in file(self.__source_name, 'rb'):
             detector.feed(line)
             lines += 1
             if detector.done or lines == 50:
                 break
         detector.close()
         encoding = string.lower(detector.result['encoding'])
     except:
         log.exception('')
         encoding = 'utf_8'
     encoding = self._encoding_cleanup.sub('', encoding)
     model = self.gtk.get_widget('e_encoding').get_model()
     itempos = 0
     for item in model:
         pos1 = string.find(
             self._encoding_cleanup.sub('', string.lower(str(item[0]))),
             encoding)
         if pos1 == 0:
             break
         itempos += 1
     self.gtk.get_widget('e_encoding').set_active(itempos)
     # auto-detect CSV import settings (optional)
     try:
         import csv
         sniffer = csv.Sniffer()
         csvfilesize = os.path.getsize(self.__source_name)
         if csvfilesize > 65535:
             csvfilesize = 65535
         csvfile = file(self.__source_name, 'rb')
         try:
             # quote char, line terminator and field delimiter
             proposed_dialect = sniffer.sniff(csvfile.read(csvfilesize))
             self.gtk.get_widget('e_delimiter').set_text(
                 proposed_dialect.delimiter)
             self.gtk.get_widget('e_quotechar').set_text(
                 proposed_dialect.quotechar)
             if proposed_dialect.lineterminator == '\r\n':
                 self.gtk.get_widget('e_lineterminator').set_active(1)
             # first row with column headers
             csvfile.seek(0)
             if sniffer.has_header(csvfile.read(csvfilesize)):
                 self.gtk.get_widget('e_startrow').set_text('1')
             else:
                 self.gtk.get_widget('e_startrow').set_text('0')
         finally:
             csvfile.close()
     except:
         log.exception('')
     # run dialog
     response = self.gtk.get_widget('d_import').run()
     if response == gtk.RESPONSE_OK:
         return True
     else:
         return False
Example #8
# 3.2 Read the whole file to determine its encoding
f_temp = open('测试文件.txt', 'rb')  # 测试文件.txt is a GB2312-encoded text file created under Windows 7
f_content = f_temp.read()
enc = chardet.detect(f_content)
f_temp.close()
result = '测试文件.txt' + " info >>\n\t" + "Detected language: " + enc['language'] + "\n\tEncoding: " + enc['encoding'] + "\n\tConfidence: " + str(enc['confidence'])
print(result)
# 测试文件.txt info >>	Detected language: Chinese	Encoding: GB2312	Confidence: 0.99


# 3.3 Advanced usage: when the document to be examined is very large, you can use chardet's sub-module chardet.universaldetector.
# It lets you detect the encoding incrementally (feeding the text line by line, or in chunks of your choosing) and stop early once a confidence threshold is reached.
# This saves resources and improves efficiency while keeping the detection result accurate.

detector = UniversalDetector()  # initialize a UniversalDetector object
detector.reset()  # clear the result of any previous detection
f = open('测试文件.txt', 'rb')  # 测试文件.txt is a GB2312-encoded text file created under Windows 7

for line in f:
    detector.feed(line)  # feed the lines one by one into the UniversalDetector for analysis
    if detector.done:  # done is a boolean, False by default; it becomes True once the confidence threshold is reached
        break

detector.close()  # call this to finalize the detection result
f.close()
print(detector.result)
# {'language': 'Chinese', 'encoding': 'GB2312', 'confidence': 0.99}


Example #9
def filelist(root):
    wb = px.load_workbook('./filelist.xlsx')  # Excel workbook to write to
    ws = wb.active
    START_COL = 4  # starting row
    ROW_C = 67  # column C
    ROW_M = 77  # column M
    ROW_V = 86  # column V
    ID_LIST = [65, 1, 1, 1, 1, 1, 1, 1, 1, 1]  #65 = A
    PREV_COUNTER = 0
    PREV_DIRECTORY = ''

    ## Recursively search the current directory
    for dirpath, dirname, filename in os.walk(root):
        for FILENAME in filename:
            ## Restrict to files with the .html or .php extension
            if fnmatch.fnmatch(FILENAME, '*.html') or fnmatch.fnmatch(
                    FILENAME, '*.php'):
                ## Target file path
                html = os.path.join(dirpath, FILENAME)

                ## Detect the character encoding of the target file
                detector = UniversalDetector()
                with open(html, mode='rb') as f:
                    for binary in f:
                        detector.feed(binary)
                        if detector.done:
                            break
                detector.close()

                ## Strip the root path since it is not needed
                PATH = html.replace(root, '')
                DIRECTORY = dirpath.replace(root, '') + '/'
                print(PATH)

                ## Get the HTML information
                try:
                    soup = BeautifulSoup(
                        open(html, encoding=detector.result['encoding']),
                        "lxml")

                    ## Get the title
                    HEAD = soup.find("head")
                    TITLE_CONTENT = HEAD.find("title")
                    if TITLE_CONTENT != None:
                        TITLE = TITLE_CONTENT.text
                    else:
                        ## Set 'null' when there is no title
                        TITLE = 'null'

                    ## Get the keywords
                    META_KEYWORDS = HEAD.find('meta',
                                              attrs={'name': 'keywords'})
                    if META_KEYWORDS != None:
                        KEYWORDS = META_KEYWORDS.attrs['content']
                    else:
                        ## Set 'null' when there are no keywords
                        KEYWORDS = 'null'

                    ## Get the description
                    META_DESCRIPTION = soup.find('meta',
                                                 attrs={'name': 'description'})
                    if META_DESCRIPTION != None:
                        DESCRIPTION = META_DESCRIPTION.attrs['content']
                    else:
                        ## Set 'null' when there is no description
                        DESCRIPTION = 'null'

                ## When the HTML information could not be retrieved
                except:
                    TITLE = 'エラー'
                    KEYWORDS = ''
                    DESCRIPTION = ''

                ## Get the directory depth
                COUNTER = Counter(PATH)

                ## Set the page ID
                if PREV_COUNTER >= COUNTER['/']:
                    ID_LIST[(COUNTER['/'] -
                             1)] = ID_LIST[(COUNTER['/'] - 1)] + 1
                    for i in range(COUNTER['/'], 6):
                        ID_LIST[i] = 1

                if COUNTER['/'] == 2:
                    if PREV_DIRECTORY != DIRECTORY:
                        ID_LIST[0] = ID_LIST[0] + 1
                        ID_LIST[1] = 1

                ## Save the previous page's information
                PREV_COUNTER = COUNTER['/']
                PREV_DIRECTORY = DIRECTORY

                ## Write the page ID
                num = 0
                if COUNTER['/'] == 1:
                    ROW_ID = 2
                else:
                    ROW_ID = COUNTER['/']

                for i in range(ROW_C, ROW_C + ROW_ID):
                    if i == ROW_C:
                        ws[str(chr(i)) + str(START_COL)].value = str(
                            chr(ID_LIST[0]))
                    else:
                        ws[str(chr(i)) + str(START_COL)].value = ID_LIST[num]
                    num = num + 1

                ## Write the title
                for i in range(ROW_M, ROW_V):
                    if i == ROW_M + (COUNTER['/'] - 1):
                        ws[str(chr(i)) + str(START_COL)].value = TITLE

                    ## Columns to the left of this page's level
                    if i - ROW_M < (COUNTER['/'] - 1):
                        ws[str(chr(i)) + str(START_COL)].border = Border(
                            left=Side(style='thin', color='000000'), )
                    ## Column at this page's level
                    elif i - ROW_M == (COUNTER['/'] - 1):
                        ws[str(chr(i)) + str(START_COL)].border = Border(
                            top=Side(style='thin', color='000000'),
                            left=Side(style='thin', color='000000'),
                        )
                    ## Columns to the right of this page's level
                    else:
                        ws[str(chr(i)) + str(START_COL)].border = Border(
                            top=Side(style='thin', color='000000'),
                            bottom=Side(style='thin', color='000000'),
                        )

                ## Write the information for each page
                # ws['R'+str(START_COL)].value = DIRECTORY      # directory
                # ws['S'+str(START_COL)].value = FILENAME       # file name
                ws['W' + str(START_COL)].value = PATH  # path
                ws['X' + str(START_COL)].value = KEYWORDS  # keywords
                ws['Y' + str(START_COL)].value = DESCRIPTION  # description

                START_COL = START_COL + 1

    ## Adjust the borders of the last row
    for i in range(ROW_M, ROW_V):
        if i == ROW_M:
            ws[str(chr(i)) + str(START_COL)].border = Border(
                top=Side(style='thin', color='000000'),
                bottom=Side(style='thin', color='000000'),
                left=Side(style='thin', color='000000'),
            )
        elif i == ROW_V:
            ws[str(chr(i)) + str(START_COL)].border = Border(
                top=Side(style='thin', color='000000'),
                bottom=Side(style='thin', color='000000'),
                right=Side(style='thin', color='000000'),
            )
        else:
            ws[str(chr(i)) + str(START_COL)].border = Border(
                top=Side(style='thin', color='000000'),
                bottom=Side(style='thin', color='000000'),
            )

    ## Save the Excel workbook
    wb.save('./filelist.xlsx')
Example #10
    def run(self):
        confidence = 0
        size = os.stat(self.file_name).st_size
        if BINARY.search(self.file_name):
            encoding = 'BINARY'
            confidence = 1
        elif size > 1048576 and maybe_binary(self.file_name):
            encoding = 'BINARY'
            confidence = 0.7
        elif size > 1048576:  # skip files > 1Mb
            encoding = 'Unknown'
            confidence = 1
        else:
            started_at = time.time()
            timeout = False

            detector = UniversalDetector()
            fp = open(self.file_name, 'rb')
            line = fp.readline(500)
            while line != b'':
                detector.feed(line)
                if time.time() - started_at > 8:
                    timeout = True
                    break
                line = fp.readline(8000)
            fp.close()
            detector.close()
            if timeout == False or (timeout == True and detector.done):
                encoding = str(detector.result['encoding']).upper()
                confidence = detector.result['confidence']
            else:
                encoding = 'Unknown'
                confidence = 1

            if encoding == 'ASCII':
                encoding = 'UTF-8'
            elif encoding == None or encoding == 'NONE' or encoding == '' or encoding == 'Unknown' or confidence < 0.7:
                if encoding == 'ISO-8859-2' and confidence > 0.69:
                    workaround = self.test_fallback_encodings(
                        ['UTF-8', 'ISO-8859-1'])
                    if workaround != False:
                        encoding = workaround
                    else:
                        encoding = 'Unknown'
                elif encoding != 'ISO-8859-2' and confidence > 0.49:
                    if encoding == 'WINDOWS-1252':
                        encoding = 'ISO-8859-1'
                else:
                    fallback = self.test_fallback_encodings()
                    if fallback == False:
                        encoding = 'Unknown'
                    else:
                        encoding = fallback

            # workarounds here
            if encoding == 'ISO-8859-2' or encoding == 'MACCYRILLIC':
                workaround = self.test_fallback_encodings(
                    ['UTF-8', 'ISO-8859-1'])
                if workaround != False:
                    encoding = workaround

            del detector
        sublime.set_timeout(lambda: self.callback(encoding, confidence), 0)
Example #11
def getFileEncoding(filePath):
    detector = UniversalDetector()
    f = open(filePath, 'rb')
    detector.feed(f.read())
    detector.close()
    return detector.result['encoding']
Example #12
 def __init__(self, filename):
     self.detector = UniversalDetector()
     self.encodingFormat(filename)
     self.decoderFile(filename)
Example #13
def read_into_dataframe(file: IO,
                        filename: str = "",
                        nrows: int = 100,
                        max_characters: int = 50) -> pd.DataFrame:
    """Reads a file into a DataFrame.
    Infers the file encoding and whether a header column exists
    Args:
        file (IO): file buffer.
        filename (str): filename. Used to infer compression.
        nrows (int, optional): number of rows to peek. Default: 100.
        max_characters (int, optional): max characters a column name can have to be distinguished from a real text value
    Returns:
        A pandas.DataFrame.
    """
    detector = UniversalDetector()
    for line, text in enumerate(file):
        detector.feed(text)
        if detector.done or line > nrows:
            break
    detector.close()
    encoding = detector.result.get("encoding")

    compression = infer_compression(filename, "infer")

    file.seek(0, SEEK_SET)
    contents = file.read()

    with BytesIO(contents) as file:
        df0 = pd.read_csv(
            file,
            encoding=encoding,
            compression=compression,
            sep=None,
            engine="python",
            header="infer",
            nrows=nrows,
        )

    df0_cols = list(df0.columns)

    # Check if all column names are strings and are short (text values tend to be long)
    column_names_checker = all([type(item) == str for item in df0_cols])
    if column_names_checker:
        column_names_checker = all(
            [len(item) < max_characters for item in df0_cols])

    #Check if any column can be turned to float
    conversion_checker = True
    for item in df0_cols:
        try:
            item = float(item)
            conversion_checker = False
            break
        except:
            pass

    #Prefix and header
    final_checker = True if (column_names_checker
                             and conversion_checker) else False
    header = "infer" if final_checker else None
    prefix = None if header else "col"

    with BytesIO(contents) as file:
        df = pd.read_csv(
            file,
            encoding=encoding,
            compression=compression,
            sep=None,
            engine="python",
            header=header,
            prefix=prefix,
        )
    return df
Example #14
def parse_dat_file(dat_path, spec_csv_path, out_folder):
    """Parse a .DAT file (CSPro fixed-width text datafile) into a series of CSV files 
    containing the tabular data for each table contained in the .DAT and described in the 
    associated .DCD file. 
    
    Developed for use in particular with DAT files provided in the "hierarchical data"
    from DHS, but may be more generally applicable to CSPro format files. The .DCF file 
    must be parsed first, using DCF_Parser, and the table specification file it 
    generates is used by this function to parse the data file.
    
    Produces one CSV data file for every table (recordtype) defined in the .DCF and occurring in 
    the .DAT. """
    filecode = os.path.extsep.join(os.path.basename(dat_path).split(os.path.extsep)[:-1])

    # See if we've already done this one
    test_fn = os.path.join(out_folder, f"{filecode}.REC01.csv")
    if os.path.exists(test_fn):
        print("Already parsed " + filecode)
        return
    print("Parsing "+dat_path)

    # read the parsed file specification in CSV form which was created by parsing the .dcf file
    # The first row specifies where, on all subsequent rows, the "record type" is found i.e. the identifier
    # that specifies which table the row defines a variable for. This is constant throughout the file.
    # Each remaining item in the parsed DCF spec defines one field from one table, specifying what position that
    # field's data is found in the fixed-width text format row when the row's record_type_info
    # (destination table name) is for this table
    with open(spec_csv_path, 'r') as dict_file:
        dict_file_reader = csv.DictReader(dict_file)
        # the record type position info must be in the first line
        recordtype_info = next(dict_file_reader)
        rt_start = int(recordtype_info['Start']) - 1
        rt_end = int(recordtype_info['Len']) + rt_start
        all_vars_this_file = [row for row in dict_file_reader]
    for field_info in all_vars_this_file:
        field_info['Start'] = int(field_info['Start'])
        field_info['Len'] = int(field_info['Len'])
    # sort them by record type (i.e. destination table) then position in the row (order of fields)
    sorted_fields = sorted(all_vars_this_file, key=(itemgetter('RecordTypeValue', 'Start')))

    # build a dictionary of record type (i.e. tablename) : list of its fields (i.e. field infos)
    rt_field_info = {}
    for field_info in sorted_fields:
        record_tag = field_info['RecordTypeValue']
        if record_tag not in rt_field_info:
            rt_field_info[record_tag] = []
        rt_field_info[record_tag].append(field_info)

    # now parse the data file
    result = {}
    n_cols_per_table = {}

    detector = UniversalDetector()
    with open(dat_path, 'rb') as f:
        for line in f:
            detector.feed(line)
            if detector.done: break
        detector.close()
        enc = detector.result['encoding']

    with open(dat_path, 'r', encoding=enc) as data:
        for i, line in enumerate(data):
            #if i == 0 and line.startswith(codecs.BOM_UTF8):
            #    print(f"File {dat_path} appears to contain BOM; ignoring it")
            #    line = line[len(codecs.BOM_UTF8):]
            record_type = line[rt_start:rt_end]
            if record_type not in rt_field_info:
                print("Specification for recordtype '{0!s}' not found in file for {1!s} at line {2!s}".format(
                    record_type, filecode, i))
                continue
            record_spec = rt_field_info[record_type]
            if record_type not in result:
                result[record_type] = []

            # split the column-aligned text according to the row specification

            # The .DAT format allows a fixed width for each column of each recordtype.
            # Should we strip the whitespace on shorter values? This is difficult.
            # In general, yes we should, because values are stored as fixed-width and where 
            # shorter than the field, are padded with spaces, which would take up unnecessary space 
            # and would prevent joining/comparison between surveys. 
            # HOWEVER in the case of the CASEID / HHID variables we must NOT strip the whitespace. 
            # The HHID is usually the CASEID with the last 3 chars trimmed off, but if we
            # trim "some" whitespace from HHID here then we can break that association and
            # damage referential integrity.
            # On the other hand some joins are based on e.g. BIDX (recorded as len 2)
            # to MIDX (recorded as len 1, despite containing the same data), and we need
            # to join on a single digit found in both so BIDX would need to be stripped.

            # Define a lambda to strip or not strip accordingly, and use it in a list comp to
            # split the row into its field values
            strip_or_not = lambda data, name: data if name in ('CASEID', 'HHID') else data.strip()
            rowParts = [strip_or_not(
                (line[i['Start'] - 1: i['Start'] + i['Len'] - 1]),
                i['Name'])
                for i in record_spec]

            if record_type not in n_cols_per_table:
                n_cols_per_table[record_type] = len(rowParts)
            else:
                assert len(rowParts) == n_cols_per_table[record_type]
            # add as a list to the list of rows for this record type
            result[record_type].append(rowParts)  # (",".join(rowParts))

    for record_type, field_infos in rt_field_info.items():
        if not record_type in result:
            print(f"No rows were found for record type {record_type} in file {filecode} despite DCF specification")
            continue
        field_header = [i['Name'] for i in field_infos]
        field_records = set([i['RecordName'] for i in field_infos])
        assert len(field_records) == 1
        rec_name = field_records.pop()
        if not os.path.exists(out_folder):
            os.makedirs(out_folder)
        out_fn = os.path.join(out_folder, f"{filecode}.{rec_name}.csv")
        with open(out_fn, 'w', newline='', encoding='utf-8') as out_csv:
            csv_writer = csv.writer(out_csv)
            csv_writer.writerow(field_header)
            csv_writer.writerows(result[record_type])
Example #15
 def __init__(self):
     self.detector = UniversalDetector()
Example #16
def detect_file_encoding(training_file,
                         file_encoding,
                         max_passwords=10000,
                         default='utf-8'):
    print()
    print("Attempting to autodetect file encoding of the file: " +
          str(training_file),
          file=sys.stderr)
    print("-----------------------------------------------------------------")

    ##--Try to import chardet.
    ##--If that package is not installed print out a warning and use the default
    try:
        from chardet.universaldetector import UniversalDetector
        detector = UniversalDetector()
    except ImportError as error:
        print("FAILED: chardet not installed", file=sys.stderr)
        print("IT IS HIGHLY RECOMMENDED THAT YOU INSTALL THE chardet PACKAGE",
              file=sys.stderr)
        print(
            "or manually specify the file encoding of the training set via the command line",
            file=sys.stderr)
        print(
            "You can download chardet from https://pypi.python.org/pypi/chardet",
            file=sys.stderr)
        print("Defaulting as " + default, file=sys.stderr)
        file_encoding.append(default)
        return True

    ##--Read through up to the number specified in 'max_passwords' to identify the character encoding
    try:
        cur_count = 0
        with open(training_file, 'rb') as file:
            for line in file.readlines():
                detector.feed(line)
                if detector.done:
                    break
                cur_count = cur_count + 1
                if cur_count >= max_passwords:
                    break
            detector.close()
    except IOError as error:
        print("Error opening file " + training_file)
        print("Error is " + str(error))
        return False

    try:
        file_encoding.append(detector.result['encoding'])
        print("File Encoding Detected: " + str(detector.result['encoding']),
              file=sys.stderr)
        print("Confidence for file encoding: " +
              str(detector.result['confidence']),
              file=sys.stderr)
        print(
            "If you think another file encoding might have been used please manually specify the file encoding and run the training program again",
            file=sys.stderr)
        print()
    except KeyError as error:
        print("Error encountered with file encoding autodetection",
              file=sys.stderr)
        print("Error : " + str(error))
        return False

    return True
Example #17
def File_import(Folder_path):

    df_list = []
    detector = UniversalDetector()
    file_list = glob(os.path.join(Folder_path, '*.csv'))

    for f in file_list:
        detector.reset()
        for line in open(f, 'rb'):
            detector.feed(line)
            if detector.done:
                break
        detector.close()
        print(detector.result['encoding'])
        try:
            try:
                _df = pd.DataFrame()
                _df = pd.read_csv(
                    f, header=1, skipfooter=1,
                    encoding=detector.result['encoding'], engine='python')
            except:
                _df = pd.DataFrame()
                _df = pd.read_csv(
                    f, header=1, skipfooter=1, encoding='cp932',
                    engine='python')
        except:
            print(f)
            sys.exit('error: cannot open.')
        if _df.shape[1] == 1:
            try:
                _df = pd.DataFrame()
                _df = pd.read_csv(
                    f, header=0, encoding=detector.result['encoding'])
            except:
                _df = pd.DataFrame()
                _df = pd.read_csv(
                    f, header=0, encoding=detector.result['encoding'])
        df_list.append(_df)

    df_165 = pd.DataFrame()
    df_167 = pd.DataFrame()
    df_append = pd.DataFrame()

    for df in df_list:
        if df.shape[1] == 104:
            df_165 = pd.concat([df_165, df], axis=0)
        elif df.shape[1] == 101:
            df_167 = pd.concat([df_167, df], axis=0)
        elif df.shape[1] == 1:
            df_append = pd.concat([df_append, df], axis=0)

        if set(df_167.columns) != set(df.columns):
            print(df_167.columns)
            print(df.columns)

    df_165.reset_index(inplace=True)
    df_167.reset_index(inplace=True)
    print('165:{0}, 167:{1}, append:{2}'.format(df_165.shape, df_167.shape, df_append.shape))

    if df_165.shape[0] == 0 and df_167.shape[0] == 0:
        sys.exit('error: 0 csv file import.')

    return df_165, df_167, df_append
Example #18
    def iter_files(old_kw, new_kw, root_dir, **kwargs):
        """
        Walk the root directory recursively.
        :param old_kw: old keyword
        :param new_kw: new keyword
        :param root_dir: absolute path of the directory
        :param kwargs: custom options
        """
        if not old_kw:
            raise Exception("The original keyword is empty, please provide one!")
        isdir = os.path.isdir(root_dir)
        if not isdir:
            raise Exception("Directory not found! Please check that the path is correct!")

        keywords_dict = {}
        for k, v in kwargs.items():
            keywords_dict[k] = v

        detector = UniversalDetector()
        for root, dirs, files in os.walk(root_dir, topdown=False):
            # Replace file contents
            if keywords_dict['content'] == 1:
                # walk the innermost level first, then work upwards
                for file_name in files:
                    old_file_path = os.path.join(root, file_name)
                    file_data = ""
                    # detect this file's encoding (reset the detector before reuse)
                    detector.reset()
                    with open(old_file_path, 'rb') as file:
                        for line in file.readlines():
                            detector.feed(line)
                            if detector.done:
                                break
                        detector.close()
                        curr_encode = detector.result['encoding']
                    # if the string to be replaced appears in the file contents, read it line by line and substitute it
                    with open(old_file_path,
                              'r',
                              encoding=curr_encode,
                              errors='ignore') as f:
                        for line in f.readlines():
                            new_line = line.replace(old_kw, new_kw)
                            file_data += new_line
                    with open(old_file_path,
                              'w',
                              encoding=curr_encode,
                              errors='ignore') as f:
                        f.write(file_data)

            # Replace file names
            if keywords_dict['filename'] == 1:
                for file_name in files:
                    old_file_path = os.path.join(root, file_name)
                    # if the string to be replaced appears in the file name, rename the file
                    if old_kw in file_name:
                        new_file_name = file_name.replace(old_kw, new_kw)
                        new_file_path = os.path.join(root, new_file_name)
                        os.rename(old_file_path, new_file_path)

            # Replace directory names
            if keywords_dict['dirname'] == 1:
                for dir_name in dirs:
                    old_dir_path = os.path.join(root, dir_name)
                    # if the string to be replaced appears in the directory name, rename the directory
                    if old_kw in dir_name:
                        new_dir_name = dir_name.replace(old_kw, new_kw)
                        new_dir_path = os.path.join(root, new_dir_name)
                        os.rename(old_dir_path, new_dir_path)
Example #19
def opener(file, prefix='texts/'):
    detector = UniversalDetector()
    detector.feed(open(prefix + file, 'rb').read())
    detector.close()
    return open(prefix + file, encoding=detector.result['encoding']).read()
Example #20
    def parse_csv(self, file_id, skip_header=False):
        """
    Attempt to parse a previously uploaded file as a table or spreadsheet. Generate rows as they're
    requested.

    @type file_id: unicode
    @param file_id: id of the file to parse
    @type skip_header: bool
    @param skip_header: if a line of header labels is detected, don't include it in the generated
                        rows (defaults to False)
    @rtype: generator
    @return: rows of data from the parsed file. each row is a list of elements
    @raise Parse_error: there was an error in parsing the given file
    """
        APPROX_SNIFF_SAMPLE_SIZE_BYTES = 1024 * 50

        try:
            import csv

            table_file = Upload_file.open_file(file_id)
            table_file.seek(
                0
            )  # necessary in case the file is opened by another call to parse_csv()
            sniffer = csv.Sniffer()

            # attempt to determine the presence of a header
            lines = table_file.readlines(APPROX_SNIFF_SAMPLE_SIZE_BYTES)
            sniff_sample = "".join(lines)

            has_header = sniffer.has_header(sniff_sample)

            # attempt to determine the file's character encoding
            detector = UniversalDetector()
            for line in lines:
                detector.feed(line)
                if detector.done: break

            detector.close()
            encoding = detector.result.get("encoding")

            table_file.seek(0)
            reader = csv.reader(table_file)

            # skip the header if requested to do so
            if has_header and skip_header:
                reader.next()

            expected_row_length = None

            for row in reader:
                # all rows must have the same number of elements
                current_row_length = len(row)
                if current_row_length == 0:
                    continue

                if expected_row_length and current_row_length != expected_row_length:
                    raise Parse_error()
                else:
                    expected_row_length = current_row_length

                yield [element.decode(encoding) for element in row]
        except (csv.Error, IOError, TypeError):
            raise Parse_error()
Example #21
def parse_csv(myfile, newsletter, ignore_errors=False):
    from newsletter.addressimport.csv_util import UnicodeReader
    import codecs
    import csv

    # Detect encoding
    from chardet.universaldetector import UniversalDetector

    detector = UniversalDetector()

    for line in myfile.readlines():
        detector.feed(line)
        if detector.done:
            break

    detector.close()
    charset = detector.result['encoding']

    # Reset the file index
    myfile.seek(0)

    # Attempt to detect the dialect
    encodedfile = codecs.EncodedFile(myfile, charset)
    dialect = csv.Sniffer().sniff(encodedfile.read(1024))

    # Reset the file index
    myfile.seek(0)

    logger.info('Detected encoding %s and dialect %s for CSV file', charset,
                dialect)

    myreader = UnicodeReader(myfile, dialect=dialect, encoding=charset)

    firstrow = myreader.next()

    # Find name column
    colnum = 0
    namecol = None
    for column in firstrow:
        if "name" in column.lower() or ugettext("name") in column.lower():
            namecol = colnum

            if "display" in column.lower() or \
                    ugettext("display") in column.lower():
                break

        colnum += 1

    if namecol is None:
        raise forms.ValidationError(
            _("Name column not found. The name of this column should be "
              "either 'name' or '%s'.") % ugettext("name"))

    logger.debug("Name column found: '%s'", firstrow[namecol])

    # Find email column
    colnum = 0
    mailcol = None
    for column in firstrow:
        if 'email' in column.lower() or \
                'e-mail' in column.lower() or \
                ugettext("e-mail") in column.lower():

            mailcol = colnum

            break

        colnum += 1

    if mailcol is None:
        raise forms.ValidationError(
            _("E-mail column not found. The name of this column should be "
              "either 'email', 'e-mail' or '%(email)s'.") %
            {'email': ugettext("e-mail")})

    logger.debug("E-mail column found: '%s'", firstrow[mailcol])

    #assert namecol != mailcol, \
    #    'Name and e-mail column should not be the same.'
    if namecol == mailcol:
        raise forms.ValidationError(
            _("Could not properly determine the proper columns in the "
              "CSV-file. There should be a field called 'name' or "
              "'%(name)s' and one called 'e-mail' or '%(e-mail)s'.") % {
                  "name": _("name"),
                  "e-mail": _("e-mail")
              })

    logger.debug('Extracting data.')

    addresses = {}
    for row in myreader:
        if not max(namecol, mailcol) < len(row):
            logger.warn("Column count does not match for row number %d",
                        myreader.line_num,
                        extra=dict(data={'row': row}))

            if ignore_errors:
                # Skip this record
                continue
            else:
                raise forms.ValidationError(
                    _("Row with content '%(row)s' does not contain a name and "
                      "email field.") % {'row': row})

        name = check_name(row[namecol], ignore_errors)
        email = check_email(row[mailcol], ignore_errors)

        logger.debug("Going to add %s <%s>", name, email)

        try:
            validate_email(email)
            addr = make_subscription(newsletter, email, name)
        except ValidationError:
            if ignore_errors:
                logger.warn(
                    "Entry '%s' at line %d does not contain a valid "
                    "e-mail address.",
                    name,
                    myreader.line_num,
                    extra=dict(data={'row': row}))
            else:
                raise forms.ValidationError(
                    _("Entry '%s' does not contain a valid "
                      "e-mail address.") % name)

        if addr:
            if email in addresses:
                logger.warn(
                    "Entry '%s' at line %d contains a "
                    "duplicate entry for '%s'",
                    name,
                    myreader.line_num,
                    email,
                    extra=dict(data={'row': row}))

                if not ignore_errors:
                    raise forms.ValidationError(
                        _("The address file contains duplicate entries "
                          "for '%s'.") % email)

            addresses.update({email: addr})
        else:
            logger.warn(
                "Entry '%s' at line %d is already subscribed to "
                "with email '%s'",
                name,
                myreader.line_num,
                email,
                extra=dict(data={'row': row}))

            if not ignore_errors:
                raise forms.ValidationError(
                    _("Some entries are already subscribed to."))

    return addresses
Example #22
    def sendData(self):
        """Load files, create and send segmentation"""

        # Check that there's something on input...
        if ((self.displayAdvancedSettings and not self.files)
                or not (self.file or self.displayAdvancedSettings)):
            self.infoBox.setText(u'Please select input file.', 'warning')
            self.send('Text data', None, self)
            return

        # Check that autoNumberKey is not empty (if necessary)...
        if self.displayAdvancedSettings and self.autoNumber:
            if self.autoNumberKey:
                autoNumberKey = self.autoNumberKey
            else:
                self.infoBox.setText(
                    u'Please enter an annotation key for auto-numbering.',
                    'warning')
                self.send('Text data', None, self)
                return
        else:
            autoNumberKey = None

        # Clear created Inputs...
        self.clearCreatedInputs()

        fileContents = list()
        annotations = list()
        counter = 1

        if self.displayAdvancedSettings:
            myFiles = self.files
        else:
            myFiles = [[self.file, self.encoding, u'', u'']]

        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(myFiles))

        # Open and process each file successively...
        for myFile in myFiles:
            filePath = myFile[0]
            encoding = myFile[1]
            encoding = re.sub(r"[ ]\(.+", "", encoding)
            annotation_key = myFile[2]
            annotation_value = myFile[3]

            # Try to open the file...
            self.error()
            try:
                if encoding == "(auto-detect)":
                    detector = UniversalDetector()
                    fh = open(filePath, 'rb')
                    for line in fh:
                        detector.feed(line)
                        if detector.done: break
                    detector.close()
                    fh.close()
                    encoding = detector.result['encoding']
                fh = open(
                    filePath,
                    mode='rU',
                    encoding=encoding,
                )
                try:
                    fileContent = ""
                    i = 0
                    chunks = list()
                    for chunk in iter(lambda: fh.read(CHUNK_LENGTH), ""):
                        chunks.append('\n'.join(chunk.splitlines()))
                        i += CHUNK_LENGTH
                        if i % (CHUNK_NUM * CHUNK_LENGTH) == 0:
                            fileContent += "".join(chunks)
                            chunks = list()
                    if len(chunks):
                        fileContent += "".join(chunks)
                    del chunks
                except UnicodeError:
                    progressBar.finish()
                    if len(myFiles) > 1:
                        message = u"Please select another encoding "    \
                                  + u"for file %s." % filePath
                    else:
                        message = u"Please select another encoding."
                    self.infoBox.setText(message, 'error')
                    self.send('Text data', None, self)
                    self.controlArea.setDisabled(False)
                    return
                finally:
                    fh.close()
            except IOError:
                progressBar.finish()
                if len(myFiles) > 1:
                    message = u"Couldn't open file '%s'." % filePath
                else:
                    message = u"Couldn't open file."
                self.infoBox.setText(message, 'error')
                self.send('Text data', None, self)
                self.controlArea.setDisabled(False)
                return

            # Remove utf-8 BOM if necessary...
            if encoding == u'utf-8':
                fileContent = fileContent.lstrip(
                    codecs.BOM_UTF8.decode('utf-8'))

            # Normalize text (canonical decomposition then composition)...
            fileContent = normalize('NFC', fileContent)

            fileContents.append(fileContent)

            # Annotations...
            annotation = dict()
            if self.displayAdvancedSettings:
                if annotation_key and annotation_value:
                    annotation[annotation_key] = annotation_value
                if self.importFilenames and self.importFilenamesKey:
                    filename = os.path.basename(filePath)
                    annotation[self.importFilenamesKey] = filename
                if self.autoNumber and self.autoNumberKey:
                    annotation[self.autoNumberKey] = counter
                    counter += 1
            annotations.append(annotation)
            progressBar.advance()

        # Create an LTTL.Input for each file...
        if len(fileContents) == 1:
            label = self.captionTitle
        else:
            label = None
        for index in range(len(fileContents)):
            myInput = Input(fileContents[index], label)
            segment = myInput[0]
            segment.annotations.update(annotations[index])
            myInput[0] = segment
            self.createdInputs.append(myInput)

        # If there's only one file, the widget's output is the created Input.
        if len(fileContents) == 1:
            self.segmentation = self.createdInputs[0]
        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                segmentations=self.createdInputs,
                label=self.captionTitle,
                copy_annotations=True,
                import_labels_as=None,
                sort=False,
                auto_number_as=None,
                merge_duplicates=False,
                progress_callback=None,
            )

        message = u'%i segment@p sent to output ' % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = 0
        for segment in self.segmentation:
            segmentLength = len(Segmentation.get_data(segment.str_index))
            numChars += segmentLength
        message += u'(%i character@p).' % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)
        progressBar.finish()
        self.controlArea.setDisabled(False)

        self.send('Text data', self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
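The widget method above interleaves encoding detection with chunked reading, BOM stripping and progress reporting. As a minimal standalone sketch of the same detect-then-read pattern (the helper name detect_and_read is hypothetical, not part of the widget):

from chardet.universaldetector import UniversalDetector

def detect_and_read(file_path):
    # Feed the raw bytes line by line until the detector is confident.
    detector = UniversalDetector()
    with open(file_path, 'rb') as fh:
        for line in fh:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    encoding = detector.result['encoding'] or 'utf-8'
    # Re-open the file as text with the detected (or fallback) encoding.
    with open(file_path, mode='r', encoding=encoding) as fh:
        return fh.read()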
Example #23
0
    def GetFileBody(self, get):
        if sys.version_info[0] == 2: get.path = get.path.encode('utf-8')
        if not os.path.exists(get.path):
            if get.path.find('rewrite') == -1:
                return public.returnMsg(False, 'FILE_NOT_EXISTS', (get.path, ))
            public.writeFile(get.path, '')

        if os.path.getsize(get.path) > 2097152:  # 2 MB limit for online editing
            return public.returnMsg(False, 'CANT_EDIT_ONLINE_FILE')
        fp = open(get.path, 'rb')
        data = {}
        data['status'] = True

        try:
            if fp:
                from chardet.universaldetector import UniversalDetector
                detector = UniversalDetector()
                srcBody = b""
                for line in fp.readlines():
                    detector.feed(line)
                    srcBody += line
                detector.close()
                char = detector.result
                data['encoding'] = char['encoding']
                # Map the detected encoding onto the set supported here
                # (GBK, utf-8, BIG5). Each test below compares the *detected*
                # name, so anything not literally in that set falls back to
                # utf-8 in the final check.
                if (not char['encoding'] or
                        char['encoding'] in ('GB2312', 'TIS-620', 'ISO-8859-9')):
                    data['encoding'] = 'GBK'
                if char['encoding'] in ('ascii', 'ISO-8859-1'):
                    data['encoding'] = 'utf-8'
                if char['encoding'] == 'Big5':
                    data['encoding'] = 'BIG5'
                if char['encoding'] not in ('GBK', 'utf-8', 'BIG5'):
                    data['encoding'] = 'utf-8'
                try:
                    if sys.version_info[0] == 2:
                        data['data'] = srcBody.decode(data['encoding']).encode(
                            'utf-8', errors='ignore')
                    else:
                        data['data'] = srcBody.decode(data['encoding'])
                except:
                    data['encoding'] = char['encoding']
                    if sys.version_info[0] == 2:
                        data['data'] = srcBody.decode(data['encoding']).encode(
                            'utf-8', errors='ignore')
                    else:
                        data['data'] = srcBody.decode(data['encoding'])
            else:
                if sys.version_info[0] == 2:
                    data['data'] = srcBody.decode('utf-8').encode('utf-8')
                else:
                    data['data'] = srcBody.decode('utf-8')
                data['encoding'] = u'utf-8'

            return data
        except Exception as ex:
            return public.returnMsg(
                False,
                'INCOMPATIBLE_FILECODE',
                (str(ex)),
            )
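The core of GetFileBody is a detect-then-decode fallback. A minimal sketch of just that part, without the panel-specific encoding whitelist (the helper name is an assumption):

from chardet.universaldetector import UniversalDetector

def decode_file_body(path):
    # Accumulate the raw bytes while feeding them to the detector.
    detector = UniversalDetector()
    raw = b""
    with open(path, 'rb') as fp:
        for line in fp:
            detector.feed(line)
            raw += line
    detector.close()
    encoding = detector.result['encoding'] or 'utf-8'
    try:
        return raw.decode(encoding)
    except (UnicodeDecodeError, LookupError):
        # Fall back to utf-8 with replacement rather than failing outright.
        return raw.decode('utf-8', errors='replace')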
Example #24
0
def getEncodingByContent(content):
    detector = UniversalDetector()
    detector.feed(content)
    detector.close()
    return detector.result["encoding"]
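When the whole payload is already in memory, chardet's one-shot chardet.detect() helper wraps the same detector. A sketch of an equivalent (the function name is an assumption):

import chardet

def get_encoding_oneshot(content):
    # chardet.detect() feeds the byte string to a UniversalDetector internally
    # and returns a dict with 'encoding' and 'confidence'.
    return chardet.detect(content)["encoding"]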
Example #25
0
def main(argv):
	
	# Collect command options
	try:
		opts, args = getopt.getopt(argv,"ri:o:",["ifile=","odir="])
	except getopt.GetoptError:
		print('Failed with arguments', argv)
		sys.exit(2)

	# Read command options
	input_file = ''
	output_directory = ''
	recurse = False
	for opt, arg in opts:
		if opt == '-r':
			recurse = True
		elif opt in ("-i", "--ifile"):
			input_file = arg
		elif opt in ("-o", "--odir"):
			output_directory = arg

	# Set default values
	if input_file == '':
		input_file = '.'
	if output_directory == '':
		output_directory = './yeet/'

	# Determine absolute paths
	input_path = os.path.abspath(input_file)
	output_directory = os.path.abspath(output_directory)

	# Make sure the input file or directory exists
	if not os.path.exists(input_path):
		print('The input path {} does not exist.'.format(input_path))
		sys.exit(1)

	# Determine which files will be used as input
	inputs = []
	if os.path.isdir(input_path):
		input_directory = input_path
		for extension in ('h', 'hpp', 'c', 'cpp'):
			
			if recurse: # Check input and sub directories
				path = os.path.join(input_path, '**/*.' + extension)
				files = glob.glob(path, recursive=True)
			else:
				# Check input directory only
				path = os.path.join(input_path, '*.' + extension)
				files = glob.glob(path)
			
			inputs.extend(files)
	else:
		input_directory = os.path.dirname(input_path)
		inputs.append(input_path)

	yeet_table = {}
	yeet_generator = YeetGenerator()

	detector = UniversalDetector()
	for input_file in inputs:

		print('Yeeting {}'.format(input_file))

		# Attempt to find the encoding of the file that is being read.
		try:
			detector.reset()
			for line in open(input_file, 'rb'):
				detector.feed(line)
				if detector.done: break
			detector.close()
		except UnicodeDecodeError:
			print('Could not decode file {}'.format(input_file))
			detector.close()
			continue

		try:
			with open(input_file, "r", encoding=detector.result['encoding']) as fi:
				file_string = fi.read()
		except UnicodeDecodeError:
			print('Could not read file {} \n Expected encode was: {}'.format(input_file, detector.result['encoding']))
			continue

		yeeted_file_string = yeet_file(file_string, yeet_table, yeet_generator)

		file_name = input_file.replace(input_directory, '')[1:]
		output_file_path = os.path.join(output_directory, file_name)

		try:
			os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

			yeet_header_path = get_yeet_path(output_directory, output_file_path)

			with open(output_file_path, "w") as fo:
				fo.write('#include "{}"\n'.format(yeet_header_path))
				fo.write(yeeted_file_string)

		except OSError as e:
			print('Could not yeet file {} to {}'.format(input_file, output_file_path))
			print(e)


	# Output header file with the macro definitions for all of the input files
	try:
		yeetfile = os.path.join(output_directory, 'yeet.h')
		with open(yeetfile, "w") as fo:

			for token in yeet_table.keys():
				fo.write('#define {} {}\n'.format(yeet_table[token], token))

	except OSError as e:
		print('Could not yeet file yeet.h to {}'.format(yeetfile))
		print(e)
Example #26
0
import codecs
import textract
from chardet.universaldetector import UniversalDetector
import os
import freeling
import sys

DETECTOR = UniversalDetector()


def convert_to_utf8(filename):
    global DETECTOR
    DETECTOR.reset()
    with open(filename, 'rb') as f:
        start = f.read(3)
        f.seek(0)
        for line in f:
            DETECTOR.feed(line)
            if DETECTOR.done:
                break
    DETECTOR.close()
    encoding = DETECTOR.result["encoding"]
    if encoding != "UTF-8":
        os.system('iconv -f %s -t UTF-8 "%s" > "%s.utf8"' %
                  (encoding, filename, filename))
        os.system('mv "%s.utf8" "%s"' % (filename, filename))
    elif start == codecs.BOM_UTF8:
        os.system('tail --bytes=+4 "%s" > "%s.utf8wobom"' %
                  (filename, filename))
        os.system('mv "%s.utf8wobom" "%s"' % (filename, filename))
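The example above shells out to iconv and tail. A rough pure-Python alternative under the same assumptions (the helper name is hypothetical, and the detected encoding is passed in rather than re-detected):

import codecs

def reencode_to_utf8(filename, encoding):
    # Read the raw bytes, drop a UTF-8 BOM if present, then decode with the
    # detected encoding and rewrite the file as plain UTF-8.
    with open(filename, 'rb') as f:
        raw = f.read()
    if raw.startswith(codecs.BOM_UTF8):
        text = raw[len(codecs.BOM_UTF8):].decode('utf-8')
    else:
        text = raw.decode(encoding or 'utf-8')
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(text)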
Example #27
0
    def determineEncoding(self, chardet=True):
        # BOMs take precedence over everything
        # This will also read past the BOM if present
        charEncoding = self.detectBOM(), "certain"
        if charEncoding[0] is not None:
            return charEncoding

        # If we've been overridden, we've been overridden
        charEncoding = lookupEncoding(self.override_encoding), "certain"
        if charEncoding[0] is not None:
            return charEncoding

        # Now check the transport layer
        charEncoding = lookupEncoding(self.transport_encoding), "certain"
        if charEncoding[0] is not None:
            return charEncoding

        # Look for meta elements with encoding information
        charEncoding = self.detectEncodingMeta(), "tentative"
        if charEncoding[0] is not None:
            return charEncoding

        # Parent document encoding
        charEncoding = lookupEncoding(
            self.same_origin_parent_encoding), "tentative"
        if charEncoding[0] is not None and not charEncoding[0].name.startswith(
                "utf-16"):
            return charEncoding

        # "likely" encoding
        charEncoding = lookupEncoding(self.likely_encoding), "tentative"
        if charEncoding[0] is not None:
            return charEncoding

        # Guess with chardet, if available
        if chardet:
            try:
                from chardet.universaldetector import UniversalDetector
            except ImportError:
                pass
            else:
                buffers = []
                detector = UniversalDetector()
                while not detector.done:
                    buffer = self.rawStream.read(self.numBytesChardet)
                    assert isinstance(buffer, bytes)
                    if not buffer:
                        break
                    buffers.append(buffer)
                    detector.feed(buffer)
                detector.close()
                encoding = lookupEncoding(detector.result['encoding'])
                self.rawStream.seek(0)
                if encoding is not None:
                    return encoding, "tentative"

        # Try the default encoding
        charEncoding = lookupEncoding(self.default_encoding), "tentative"
        if charEncoding[0] is not None:
            return charEncoding

        # Fallback to html5lib's default if even that hasn't worked
        return lookupEncoding("windows-1252"), "tentative"
Example #28
0
import os
import urllib.request
from bs4 import BeautifulSoup
import json
from chardet.universaldetector import UniversalDetector  # https://chardet.readthedocs.io/en/latest/usage.html#example-using-the-detect-function
import html5lib  # for BeautifulSoup parser

encode_detector = UniversalDetector()
if not os.path.isfile('./config/FSF-licenses-full.json'):
    try:
        with urllib.request.urlopen(
                'https://wking.github.io/fsf-api/licenses-full.json') as res:
            body = res.read()
        encode_detector.reset()
        encode_detector.feed(body)
        if encode_detector.done:
            encode_detector.close()
            raw_doc = body.decode(
                encode_detector.result['encoding'],
                errors='ignore')  # .encode('utf-8', 'ignore')
        else:
            encode_detector.close()
            raw_doc = body.decode('utf-8', errors='ignore')
        f = open("./config/FSF-licenses-full.json", "w", encoding='utf-8')
        f.write(raw_doc)
        f.close()
        license_metaData = json.loads(raw_doc)
    except urllib.error.HTTPError as err:
        print('licenses.json get failed', err)
        exit(1)
    except urllib.error.URLError as err:
Example #29
0
import urllib
from chardet.universaldetector import UniversalDetector

usock = urllib.urlopen('file:///C:/tmp/enem/sql-CRIACAO-TABELAS.SQL')
detector = UniversalDetector()
for line in usock.readlines():
    detector.feed(line)
    if detector.done: break
detector.close()
usock.close()
print detector.result
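The snippet above is Python 2 only (urllib.urlopen and the print statement). A rough Python 3 equivalent, assuming the same local file URL, would be:

from urllib.request import urlopen
from chardet.universaldetector import UniversalDetector

detector = UniversalDetector()
with urlopen('file:///C:/tmp/enem/sql-CRIACAO-TABELAS.SQL') as usock:
    for line in usock:
        detector.feed(line)
        if detector.done:
            break
detector.close()
print(detector.result)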
Example #30
0
    def show_encode(text):
        detector = UniversalDetector()
        detector.feed(text)
        detector.close()

        return detector.result['encoding']