Example #1
def stats_text_cn(text, count=10, print_text=False):
    """Count Chinese word frequencies and sort them from high to low.

    Parameter:
    text -- the text to process; must be a string
    Keyword Arguments:
    count -- number of items to output, 10 by default. If count <= 0,
        everything is returned. Must be an int.
    print_text -- whether to print the result; defaults to False (no printing)
    Return Value:
    A list of (word, frequency) pairs, sorted by frequency in descending order
    Potential Bug:
    Characters outside the basic CJK block (e.g. CJK extensions and radicals)
    are not counted
    """
    if not isinstance(count, int):  # check that count is an int
        raise TypeError('count should be int!')

    if not isinstance(text, str):  # check that text is a string
        raise TypeError("It's not a string!")
    text = chinese_only(text)  # keep only Chinese characters
    text = [x for x in jieba.cut(text, cut_all=False)
            if len(x) >= 2]  # jieba accurate mode; keep words of length >= 2
    if count <= 0:
        text_dict = list_to_dict_and_cal(text)  # map each word to its frequency
        text_dict = sorted(text_dict.items(),
                           key=lambda item: item[1],
                           reverse=True)  # sort by frequency, descending
    else:  # tally with collections.Counter
        text_dict = collections.Counter(text).most_common(count)
    if print_text:  # print the result if requested
        print(text_dict)
    return text_dict
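
A minimal usage sketch of the counting core above (jieba is the real segmentation library; chinese_only_sketch is a hypothetical stand-in for the chinese_only helper, which the snippet does not show):

import collections
import re

import jieba

def chinese_only_sketch(text):
    # Hypothetical stand-in for the chinese_only helper: keep only
    # characters from the basic CJK Unified Ideographs block.
    return ''.join(re.findall(r'[\u4e00-\u9fa5]', text))

sample = '今天天气很好,明天天气也很好。'
words = [w for w in jieba.cut(chinese_only_sketch(sample), cut_all=False)
         if len(w) >= 2]
print(collections.Counter(words).most_common(3))  # e.g. [('天气', 2), ('很好', 2), ('今天', 1)]
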
Example #2
def stats_text_en(text, count=10, print_text=False):
    """Count English word frequencies and sort them from high to low.

    Parameter:
    text -- the text to process; must be a string
    Keyword Arguments:
    count -- number of items to output, 10 by default. If count <= 0,
        everything is returned. Must be an int.
    print_text -- whether to print the result; defaults to False (no printing)
    Return Value:
    A list of (word, frequency) pairs, sorted by frequency in descending order
    Potential Bug:
    Tokens that mix English and Chinese characters are not counted
    """
    if not isinstance(count, int):  # check that count is an int
        raise TypeError('count should be int!')

    if not isinstance(text, str):  # check that text is a string
        raise TypeError("It's not a string!")
    text = cut_and_clean(text)  # split the string and strip punctuation
    text = english_only(text)  # drop non-English tokens
    if count <= 0:
        text_dict = list_to_dict_and_cal(text)  # map each word to its frequency
        text_dict = sorted(text_dict.items(),
                           key=lambda item: item[1],
                           reverse=True)  # sort by frequency, descending
    else:  # tally with collections.Counter
        text_dict = collections.Counter(text).most_common(count)
    if print_text:  # print the result if requested
        print(text_dict)
    return text_dict
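
A comparable sketch for the English path (cut_and_clean and english_only are project helpers not shown here; the stand-ins below are assumptions):

import collections
import re

def cut_and_clean_sketch(text):
    # Hypothetical stand-in: lowercase, replace punctuation with spaces, split.
    return re.sub(r'[^\w\s]', ' ', text.lower()).split()

def english_only_sketch(words):
    # Hypothetical stand-in: keep purely alphabetic ASCII tokens.
    return [w for w in words if re.fullmatch(r'[a-z]+', w)]

sample = 'The quick brown fox jumps over the lazy dog. The dog sleeps.'
words = english_only_sketch(cut_and_clean_sketch(sample))
print(collections.Counter(words).most_common(2))  # [('the', 3), ('dog', 2)]
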
Example #3
def use_jieba_calculate(template):
    text = chinese_only(template)
    seg_list = jieba.cut(text, cut_all=False)
    str_list = [item for item in seg_list
                if len(item) >= 2]  # keep words of length >= 2
    text_dict = list_to_dict_and_cal(str_list)
    text_dict = sorted(text_dict.items(),
                       key=lambda item: item[1],
                       reverse=True)  # sort by frequency, descending
    return text_dict[:20]  # keep the 20 most frequent words
Example #4
def stats_text_cn(text, numbers, print_text=False):
    """Count Chinese character frequencies and sort them from high to low.

    Parameter:
    text -- the text to process; must be a string
    numbers -- number of items to output
    Keyword Argument:
    print_text -- whether to print the result; defaults to False (no printing)
    Return Value:
    A list of (character, frequency) pairs, sorted by frequency in descending
    order
    Potential Bug:
    Characters outside the basic CJK block (e.g. CJK extensions and radicals)
    are not counted
    """
    text = chinese_only(text)
    text = list(text)  # split the Chinese string into a list of characters
    text_dict = list_to_dict_and_cal(text)  # count character frequencies
    text_dict = sorted(text_dict.items(),
                       key=lambda item: item[1],
                       reverse=True)  # sort by frequency, descending
    text_dict = text_dict[:numbers]  # keep the `numbers` most frequent items
    if print_text:  # print the result if requested
        print(text_dict)
    return text_dict
Example #5
def stats_text_en(text, numbers, print_text=False):
    """Count English word frequencies and sort them from high to low.

    Parameter:
    text -- the text to process; must be a string
    numbers -- number of items to output
    Keyword Argument:
    print_text -- whether to print the result; defaults to False (no printing)
    Return Value:
    A list of (word, frequency) pairs, sorted by frequency in descending order
    Potential Bug:
    Tokens that mix English and Chinese characters are not counted
    """
    text = cut_and_clean(text)  # split the string and strip punctuation
    text = english_only(text)  # drop non-English tokens
    text_dict = list_to_dict_and_cal(text)  # map each word to its frequency
    text_dict = sorted(text_dict.items(),
                       key=lambda item: item[1],
                       reverse=True)  # sort by frequency, descending
    text_dict = text_dict[:numbers]  # keep the `numbers` most frequent items
    if print_text:  # print the result if requested
        print(text_dict)
    return text_dict
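
The hand-rolled dict-plus-sort route in these variants and the Counter route in Examples #1 and #2 compute the same top-n list, up to the ordering of ties; a minimal sketch:

import collections

freq = {'the': 3, 'dog': 2, 'fox': 1}
top_by_sort = sorted(freq.items(), key=lambda item: item[1], reverse=True)[:2]
top_by_counter = collections.Counter(freq).most_common(2)
print(top_by_sort)     # [('the', 3), ('dog', 2)]
print(top_by_counter)  # [('the', 3), ('dog', 2)]
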
Example #6
def run_on_one_tile(lon: float, lat: float, db: sqlalchemy.engine.Engine,
                    hex: sqlalchemy.Table,
                    t_dist: sqlalchemy.Table) -> t.Set[ArrayIndex]:
    """Compute pairwise distances and ecoregion composition

    Given a digital elevation model as raster map and a matching ecoregions
    raster map, compute the pairwise distance to its 1-hex and 2-hex neighbors
    for every H3 address hex at standard resolution, as well as the approximate
    cover of that cell in terms of ecoregions, for all cells where that is
    possible.

    Distances are computed in gross hours of travel while navigating off-track,
    following [@irmischer2018measuring].

    Returns
    =======
    d: A mapping. d[h1][h2] is the distance, in hours, from the center of h1 to
        the center of h2.
    e: A mapping. e[h1][b] is the proportion of hex h1 covered by ecoregion b.

    """
    print("Working on hex around ({:}, {:}):".format(lon, lat))
    elevation_file = gmted_tile_from_geocoordinates(lon, lat)
    m_trafo = elevation_file.transform
    height, width = elevation_file.shape
    elevation = numpy.full((height + 1000, width + 1000), -100, int)
    ecoregions = numpy.full((height + 1000, width + 1000), 999, int)
    elevation[500:-500, 500:-500] = elevation_file.read(1)
    ecoregions[500:-500,
               500:-500] = ecoregion_tile_from_geocoordinates(lon, lat).read(1)
    print("Loading adjacent data…")
    # Pad the tile with a 500-pixel apron taken from the eight neighboring
    # tiles (tiles are 30° wide and 20° tall), skipping neighbors that do not
    # exist on disk. numpy.s_ builds the destination/source index windows.
    neighbors = [
        (-30, +20, numpy.s_[:500, :500], numpy.s_[-500:, -500:]),
        (0, +20, numpy.s_[:500, 500:-500], numpy.s_[-500:, :]),
        (+30, +20, numpy.s_[:500, -500:], numpy.s_[-500:, :500]),
        (-30, 0, numpy.s_[500:-500, :500], numpy.s_[:, -500:]),
        (+30, 0, numpy.s_[500:-500, -500:], numpy.s_[:, :500]),
        (-30, -20, numpy.s_[-500:, :500], numpy.s_[:500, -500:]),
        (0, -20, numpy.s_[-500:, 500:-500], numpy.s_[:500, :]),
        (+30, -20, numpy.s_[-500:, -500:], numpy.s_[:500, :500]),
    ]
    for d_lon, d_lat, dest, source in neighbors:
        try:
            elevation[dest] = gmted_tile_from_geocoordinates(
                lon + d_lon, lat + d_lat).read(1)[source]
            ecoregions[dest] = ecoregion_tile_from_geocoordinates(
                lon + d_lon, lat + d_lat).read(1)[source]
        except rasterio.RasterioIOError:
            pass

    print("Computing hex extents…")
    transform = rasterio.Affine(m_trafo.a, 0, m_trafo.c - 500 * m_trafo.a, 0,
                                m_trafo.e, m_trafo.f - 500 * m_trafo.e)

    def rowcol(latlon):
        lat, lon = latlon
        if lon > 170:
            # FIXME: We can and need to do this because we are working on the
            # Americas and the Americas only. The generic solution is more
            # difficult.
            lon = lon - 360
        col, row = ~transform * (lon, lat)
        return int(row), int(col)

    starts: t.List[h3.H3Index] = []
    cs = sqlalchemy.select(
        [hex.c.hexbin, hex.c.longitude, hex.c.latitude, hex.c.habitable])
    for h, lon, lat, habitable in db.execute(cs).fetchall():
        if not habitable:
            continue
        row, col = rowcol((lat, lon))
        if 500 <= col < width + 500 and 500 <= row < height + 500:
            starts.append(h)

    print("Computing terrain coefficients…")
    terrain_coefficient_raster = TC[ecoregions]

    print("Computing distances on the grid…")
    distance_by_direction = all_pairwise_distances(elevation, transform,
                                                   terrain_coefficient_raster)

    print("Computing central nodes…")
    center = {}
    partial = set()
    belongs = {}
    for row in range(ecoregions.shape[0]):
        incomplete = set()
        for col in range(ecoregions.shape[1]):
            lon, lat = transform * (col, row)
            hexbin = h3.geo_to_h3(lat, lon, RESOLUTION)
            incomplete.add(hexbin)
            if row == 0 or col < 10 or ecoregions.shape[1] - 10 <= col:
                partial.add(hexbin)
                try:
                    del belongs[hexbin]
                except KeyError:
                    pass
            elif hexbin not in partial:
                belongs.setdefault(hexbin, set()).add((row, col))
        for hexbin in set(belongs) - incomplete:
            print(f"Checking {hexbin}…")
            points = belongs.pop(hexbin)
            if hexbin in partial:
                print("Not competely in this tile.")
                continue
            result = db.execute(
                sqlalchemy.select([
                    hex.c.habitable, hex.c.vlongitude, hex.c.vlatitude,
                    hex.c.longitude, hex.c.latitude
                ]).where(hex.c.hexbin == hexbin)).fetchone()
            if not result:
                center[hexbin] = rowcol(h3.h3_to_geo(hexbin))
                print(
                    "WHAT IS THIS??? THIS HEX ({:}, {:}) IS NOT IN THE DB!!!!".
                    format(*center[hexbin]))
                continue
            h, vlon, vlat, lon, lat = result
            if vlon is not None and vlat is not None:
                print("Known in DB.")
                center[hexbin] = rowcol((vlat, vlon))
                continue
            if lon and lat and not h:
                print("Uninhabitable.")
                center[hexbin] = rowcol((lat, lon))
                continue
            print("Computing centralities…")
            db.execute(hex.update().where(hex.c.hexbin == hexbin).values(
                {"vlatitude": 0.0, "vlongitude": 0.0}))
            rmin = min(p[0] for p in points)
            rmax = max(p[0] for p in points)
            cmin = min(p[1] for p in points)
            cmax = max(p[1] for p in points)
            assert rmin >= 0
            assert cmin >= 0

            dist = {(n, e): d[rmin - min(n, 0):rmax + 1 - max(0, n),
                              cmin - min(e, 0):cmax + 1 - max(0, e)]
                    for (n, e), d in distance_by_direction.items()}

            border = [
                (i - rmin, j - cmin) for (i, j) in points
                if (i - 1, j) not in points or (i + 1, j) not in points or (
                    i, j - 1) not in points or (i, j + 1) not in points
            ]

            c = t.Counter()
            max_dist = 0
            for r0, c0 in border:
                pred = {(r0, c0): None}
                all_dist = distances_from_focus((r0, c0),
                                                set(border),
                                                dist,
                                                pred=pred)
                for b1 in border:
                    n = b1
                    while pred[n]:
                        n = pred[n]
                        c[n] += 1
                max_dist = max(max_dist, all_dist.max())
            (r0, c0), centrality = c.most_common(1)[0]
            center[hexbin] = (r0 + rmin, c0 + cmin)
            print(hexbin, center[hexbin], centrality)
            lon, lat = transform * (c0 + cmin, r0 + rmin)
            rlat, rlon = h3.h3_to_geo(hexbin)
            print(
                f"Central node at ({lon}, {lat}). [Actual center at ({rlon}, {rlat}).]"
            )
            db.execute(hex.update().where(hex.c.hexbin == hexbin).values(
                {"vlatitude": lat, "vlongitude": lon}))
            try:
                db.execute(
                    t_dist.insert({
                        "hexbin1": hexbin,
                        "hexbin2": hexbin,
                        "flat_distance": 0.0,
                        "distance": max_dist,
                        "source": 4
                    }))
            except sqlalchemy.exc.IntegrityError:
                pass
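
The rowcol helper above maps geographic coordinates to raster indices by inverting a rasterio affine transform (the `~transform * (lon, lat)` step). A minimal round-trip sketch, with a made-up half-degree grid anchored at (-90, 20):

import rasterio

# Hypothetical 0.5-degree grid: x = 0.5*col - 90, y = -0.5*row + 20.
transform = rasterio.Affine(0.5, 0, -90, 0, -0.5, 20)
col, row = ~transform * (-88.7, 19.2)  # inverse affine: (lon, lat) -> fractional (col, row)
print(int(row), int(col))  # -> 1 2
lon, lat = transform * (2, 1)  # forward: top-left corner of that pixel
print(lon, lat)  # -> -89.0 19.5
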
Example #7
def add_concepticon_references(dataset: pycldf.Wordlist,
                               gloss_languages: t.Mapping[str, str]) -> None:
    """Guess Concepticon links for a multilingual Concept table.

    Fill the concepticonReference column of the dataset's ParameterTable with
    best guesses for Concepticon IDs, based on gloss columns in different
    languages.

    Parameters
    ==========
    dataset: A pycldf.Wordlist with a concepticonReference column in its
        ParameterTable
    gloss_languages: A mapping from ParameterTable column names to ISO-639-1
        language codes that Concepticon has concept lists for (e.g. en, fr,
        de, es, zh, pt)

    """
    # TODO: If this function took only dataset["ParameterTable"] and the name
    # of the target column in there as arguments, one could construct examples
    # that just use the Iterable API and therefore look nice as doctests.
    gloss_lists: t.Dict[str, t.List[str]] = {
        column: []
        for column in gloss_languages
    }

    for row in dataset["ParameterTable"]:
        for column, glosses in gloss_lists.items():
            glosses.append(row[column]
                           or '?')  # Concepticon abhors empty glosses.

    targets = {
        language: concepticon.api._get_map_for_language(language, None)
        for language in gloss_languages.values()
    }

    cmaps: t.List[t.Tuple[t.Dict[int, t.Tuple[t.List[int], int]],
                          t.Sequence[t.Tuple[str, str]]]] = [
        (
            concept_map2(glosses,
                         [i[1] for i in targets[gloss_languages[column]]],
                         similarity_level=2,
                         language=gloss_languages[column]),
            # What a horrendous API! Why can't it return glosses or IDs instead
            # of, as it does now, target-indices so I have to schlepp target along
            # with the results?
            targets[gloss_languages[column]])
        for column, glosses in gloss_lists.items()
    ]

    write_back = []
    for i, row in enumerate(dataset["ParameterTable"]):
        matches = [(m.get(i, ([], 10)), t) for m, t in cmaps]
        best_sim = min(x[0][1] for x in matches)
        best_matches = [
            t[m] for (ms, s), t in matches for m in ms if s <= best_sim
        ]
        c: t.Counter[str] = t.Counter(id for id, string in best_matches)
        if not c:
            print(row)
        else:
            if len(c) > 1:
                print(row, best_sim, c.most_common())
            row[dataset.column_names.parameters.
                concepticonReference] = c.most_common(1)[0][0]
        write_back.append(row)

    dataset.write(ParameterTable=write_back)
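
The final per-row decision is just a majority vote over the candidate Concepticon IDs returned by the gloss-language matchers; a minimal sketch with made-up candidates:

import collections

# Made-up candidate (concepticon_id, gloss) pairs from several gloss languages.
best_matches = [('1234', 'hand'), ('1234', 'main'), ('778', 'arm')]
votes = collections.Counter(id_ for id_, gloss in best_matches)
print(votes.most_common(1)[0][0])  # -> '1234', the ID most languages agree on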